Preamble

This document was last updated on April 19, 2021.

Jason Pither & Faye Manning authored this annotated R Markdown script. This script aims to maximize the accessibility and reproducibility of the analyses associated with the publication:

Manning, F.S., Curtis, P.J., Walker, I.R. & Pither, J. (2021). Potential long-distance dispersal of freshwater diatoms adhering to waterfowl plumage. Freshwater Biology. https://doi.org/10.1111/fwb.13706.

This script is licensed under CC BY-NC-SA 4.0 (https://creativecommons.org/licenses/by-nc-sa/4.0/). If you use this material, please cite it as:

Pither, J. and Manning, F.S. (2020). Data and scripts for: Potential long-distance dispersal of freshwater diatoms adhering to waterfowl plumage. DOI 10.17605/OSF.IO/UJNW2

Session info and setup

NOTE: With the exception of base package functions and those in the ggplot2 and patchwork packages, all functions are preceded by their package sources, for example:

dplyr::filter(mydata, myvar = "yes")

Load packages

library(ggplot2)
library(dplyr)
library(minpack.lm)
library(investr)
library(ggeffects)
library(riem)
library(rdryad)
library(sf)
library(rmapshaper)
library(pscl)
library(snakecase)
library(spatstat)
library(lubridate)
library(sp)
library(gstat)
library(raster)
library(ggspatial)
library(patchwork)
library(ggsn)
library(bigleaf)
library(knitr)
library(kableExtra)
library(tidyr)

If you wish to view all installed packages in the renv lockfile:

# All packages installed in the current library paths, alphabetically sorted
# (package names are the row names of the installed.packages() matrix)
all.installed.packages <- sort(rownames(installed.packages()))
# Print the sorted vector
all.installed.packages

Key information for reproducibility

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] tidyr_1.0.0.9000    kableExtra_1.3.1    knitr_1.22         
##  [4] bigleaf_0.7.1       ggsn_0.5.0          patchwork_1.0.0    
##  [7] ggspatial_1.0.3     raster_2.8-19       gstat_2.0-6        
## [10] sp_1.3-1            lubridate_1.7.8     spatstat_1.61-0    
## [13] rpart_4.1-15        nlme_3.1-144        spatstat.data_1.4-0
## [16] snakecase_0.11.0    pscl_1.5.2          rmapshaper_0.4.1   
## [19] sf_0.9-3            rdryad_0.4.0        riem_0.1.1         
## [22] ggeffects_0.9.0     investr_1.4.0       minpack.lm_1.2-1   
## [25] dplyr_0.8.3         ggplot2_3.3.1      
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6          xts_0.12-0            oai_0.3.0            
##  [4] webshot_0.5.2         insight_0.2.0         httr_1.4.2           
##  [7] tools_3.6.3           R6_2.4.1              sjlabelled_1.0.17    
## [10] KernSmooth_2.23-16    DBI_1.0.0             mgcv_1.8-31          
## [13] colorspace_1.4-1      withr_2.3.0           tidyselect_0.2.5     
## [16] curl_4.3              compiler_3.6.3        rvest_0.3.6          
## [19] xml2_1.2.0            scales_1.0.0          DEoptimR_1.0-8       
## [22] robustbase_0.93-6     classInt_0.4-3        goftest_1.1-1        
## [25] stringr_1.4.0         digest_0.6.25         foreign_0.8-75       
## [28] solrium_1.0.2         spatstat.utils_1.13-0 rmarkdown_2.5        
## [31] jpeg_0.1-8.1          pkgconfig_2.0.3       htmltools_0.3.6      
## [34] jsonvalidate_1.1.0    rlang_0.4.10          rstudioapi_0.11      
## [37] httpcode_0.2.0        FNN_1.1.3             generics_0.0.2       
## [40] zoo_1.8-5             jsonlite_1.7.2        magrittr_1.5         
## [43] Matrix_1.2-18         Rcpp_1.0.4.6          munsell_0.5.0        
## [46] abind_1.4-5           lifecycle_0.1.0       stringi_1.4.3        
## [49] yaml_2.2.0            MASS_7.3-51.5         plyr_1.8.4           
## [52] maptools_0.9-5        sjmisc_2.7.9          forcats_0.4.0        
## [55] crayon_1.3.4          deldir_0.1-25         lattice_0.20-38      
## [58] haven_2.1.0           splines_3.6.3         tensor_1.5           
## [61] hms_0.4.2             pillar_1.4.2          rjson_0.2.20         
## [64] spacetime_1.2-3       geojsonlint_0.4.0     codetools_0.2-16     
## [67] crul_0.7.4            glue_1.4.1            evaluate_0.13        
## [70] V8_3.0.2              solartime_0.0.1       vctrs_0.2.0.9007     
## [73] png_0.1-7             RgoogleMaps_1.4.5.3   gtable_0.3.0         
## [76] purrr_0.3.3           polyclip_1.10-0       assertthat_0.2.1     
## [79] xfun_0.19             e1071_1.7-1           viridisLite_0.3.0    
## [82] class_7.3-15          tibble_2.1.3          intervals_0.15.1     
## [85] units_0.6-6           ggmap_3.0.0

Define functions

For the calibration curve work below, we use functions from this webpage:

http://weightinginbayesianmodels.github.io/poctcalibration/AMfunctions.html#curve-functions.

Here is the function for the 4 parameter calibration model:

# 4PL model
# 4PL model: four-parameter logistic calibration curve.
# Predicts intensity at x given the lower asymptote (small.x.asymp),
# upper asymptote (inf.x.asymp), inflection point (inflec) and Hill
# slope (hill). Vectorized over x.
M.4pl <- function(x, small.x.asymp, inf.x.asymp, inflec, hill){
    span <- inf.x.asymp - small.x.asymp
    small.x.asymp + span / (1 + (x / inflec)^hill)
}

# 5PL model  
# 5PL model: five-parameter logistic calibration curve.
# Like the 4PL but with an extra asymmetry exponent g.5pl applied to the
# denominator; c.5pl plays the role of the inflection parameter.
M.5pl <- function(x, small.x.asymp, inf.x.asymp, c.5pl, hill, g.5pl){
    span <- inf.x.asymp - small.x.asymp
    small.x.asymp + span / (1 + (x / c.5pl)^hill)^g.5pl
}

# And their inverse (for prediction):
# Inverse of the 4PL model: given a response y, return the predictor
# value x.hat (named "x.hat") at which M.4pl would produce y.
Inv.4pl <- function(y, small.x.asymp, inf.x.asymp, inflec, hill){
    ratio <- (inf.x.asymp - small.x.asymp) / (y - small.x.asymp)
    setNames(inflec * (ratio - 1)^(1 / hill), "x.hat")
}

# Inverse of the 5PL model: given a response y, return the predictor
# value x.hat (named "x.hat") at which M.5pl would produce y.
Inv.5pl <- function(y, small.x.asymp, inf.x.asymp, c.5pl, hill, g.5pl){
    ratio <- (inf.x.asymp - small.x.asymp) / (y - small.x.asymp)
    setNames(c.5pl * (ratio^(1 / g.5pl) - 1)^(1 / hill), "x.hat")
}

The diagnostic plot function for the calibration curve model was accessed from this webpage:

http://weightinginbayesianmodels.github.io/poctcalibration/AMfunctions.html#curve-functions.

The script was edited to include an argument (param) specifying 4 or 5 parameter model.

# Diagnostic plots for a fitted nlsLM calibration model (4PL or 5PL).
# Adapted from Brandon Greenwell's investr functions; edited to take a
# `param` argument selecting the 4- or 5-parameter logistic model.
#
# Args:
#   nlsLM.model: fitted model returned by minpack.lm::nlsLM().
#   title.top:   outer (overall) title for the two-panel figure;
#                defaults to "" so existing calls are unchanged.
#   param:       4 (default) for the 4PL model, 5 for the 5PL model.
#
# Side effects: draws two panels on the active graphics device —
# data plus fitted curve on a log-x axis, and standardized residuals
# versus fitted values.
#
# Changes from the original: the two param branches duplicated ~35 lines
# verbatim (only the curve() call differed), so the shared code is now
# written once; title.top was previously never used (dead parameter);
# T replaced with TRUE; summary() is computed once for the residuals.
plotDiag.nls <- function(nlsLM.model, title.top = "", param = 4){
  par(mfcol = c(1, 2), oma = c(0.5, 0.5, 2, 0))
  data <- eval(nlsLM.model$data)
  x.names <- intersect(all.vars(formula(nlsLM.model)[[3]]), colnames(data))
  y.names <- all.vars(formula(nlsLM.model)[[2]])
  x <- data[, x.names]  # extract predictor columns
  y <- data[, y.names]  # extract response columns
  x.nz.min <- min(x[x != 0])
  # Display purposes, we cheat a little to get the zero calibrators
  # included on the log(x) plot
  x.fix <- ifelse(x <= 0, x.nz.min / 5, x)
  break.x <- x.nz.min / 4
  # Panel 1: data and fitted curve
  plot(x.fix, y, log = "x", main = "data and fitted curve", pch = 20,
       ylab = "Intensity", xlab = "log(Count)", font.main = 3)
  grid()
  cf <- coef(nlsLM.model)
  if (param == 5) {
    curve(M.5pl(x, cf[[1]], cf[[2]], cf[[3]], cf[[4]], cf[[5]]), add = TRUE)
  } else {
    curve(M.4pl(x, cf[[1]], cf[[2]], cf[[3]], cf[[4]]), add = TRUE)
  }
  # Technically, we should not include the zero-calibrators on a log plot,
  # but it's nice to have for visualizing the results. This line inserts a
  # break in the x-axis as in Dudley et al (1985)
  plotrix::axis.break(1, break.x, brw = 0.05)
  # Panel 2: standardized (weighted) residuals vs fitted values
  resid.vals <- summary(nlsLM.model)$resid
  std.w.resid <- resid.vals / sd(resid.vals)
  plot(predict(nlsLM.model), std.w.resid, ylab = "std residuals (in SDs)",
       xlab = "fitted response values", pch = 20,
       main = "standardized residuals", font.main = 3)
  # Horizontal reference lines at y = 0 and +/- 2 SD
  abline(h = 0, lty = 3, col = "red")
  abline(h = 2, lty = 3)
  abline(h = -2, lty = 3)
  title(main = title.top, outer = TRUE)
  par(mfcol = c(1, 1))
}

Experiment analyses

This section of the script deals exclusively with the data collected as part of the diatom dispersal experiment.

Data collection methods

An Excel spreadsheet was prepared before the experiment for data entry.

At the start of each experiment day, a sample was taken from the culture and used to determine the intensity (spectrofluorometer reading) of the culture.

During each experiment day, the hygrometer/thermometer recorded real-time humidity and temperature data, once a minute. Mean actual humidity and temperature was determined for each experimental unit and entered into the Excel spreadsheet (note that the number of recordings used to determine the mean for each experimental unit depended on the time treatment; e.g. 10min treatment mean determined from 10 recordings).

Two weeks after treatment, the intensity of the medium for each experimental unit i.e. raw observations from the spectrofluorometer were entered into the Excel spreadsheet.

NOTE: Ideally, the raw logger data from the sensors would have been used as input into this script (see file “logger.csv” in “rawdata” folder), but alas, that did not happen. This is the one part of the data wrangling that is not 100% computationally reproducible. Lesson learned!


Import data

Read in data

# Read the three raw data files (paths relative to the scripts folder);
# see the data file descriptions below for the contents of each
exp_data <- read.csv("../rawdata/experiment_data.csv")
hum_over_time <- read.csv("../rawdata/humidity_over_time_10.csv")
dilutions <- read.csv("../rawdata/serial_dilutions.csv")

Data file descriptions

Descriptions of raw data files:

experiment_data - experimental data
humidity_over_time_10 - humidity readings for one set of replicates at 10min treatment
serial_dilutions - spectrofluorometer data and corresponding cell counts for multiple 2-fold dilutions

In the experiment_data dataframe:

Description of each variable:

  • id: the unique identifier for each experimental unit
  • rep: replicate i.e. experiment day
  • type: denotes which group the experimental unit is a part of (e.g. control group)
  • apparatus: specifies which of the four apparatuses the experimental unit was on
  • time_min: assigned time treatment
  • target_hum: assigned relative humidity treatment
  • actual_hum_mean: mean relative humidity, as determined from hygrometer readings
  • temp_C: mean temperature in degrees Celsius, as determined from hygrometer readings
  • int_2wk: intensity (spectrofluorometer reading) two weeks after treatment
  • culture_int: intensity (spectrofluorometer reading) of culture used in experiment, taken at start of each experiment day

We have 3 different types of “control” observations, originally coded differently in the dataframe, using the target_hum variable:

  • The method control: where feather was dipped in (presumably) sterile media; this tests whether feather had anything on it (target_hum = “#N/A” and type = “control”)
  • The paper control: where paper was dipped in sterile media; this verifies sterility of media (target_hum = “#N/A” and type = “pcontrol”)
  • The treatment control: where feather was dipped in inoculated media, then not subjected to either time or humidity treatments (target_hum = “#N/A” and type = “treatment”)

The latter group of observations was not used, as it did not serve a useful purpose and did not fit with the experimental design.

Description of each field (variable) in the hum_over_time dataframe:

  • target_hum: assigned relative humidity treatment
  • min: indicates which minute (of ten) the hygrometer reading was taken
  • actual_hum: hygrometer readings for relative humidity (one reading recorded per minute)

Description of each field (variable) in the serial_dilutions dataframe:

  • int: intensity (spectrofluorometer reading)
  • count: diatom cell count (see “Estimating cell density” section for details)
  • dil: culture dilution (dilutions made with growth medium)

Data wrangling

Reorganize data

First, in the exp_data dataframe, the target_hum variable needs to be reorganized, as it’s not ordered, and it has “#N/A”.

These "#N/A" values correspond to when the feather or paper was simply put in the apparatus and then taken out immediately. So, let’s reorganize the variable.

We end up with a new ordered factor for the target humidity, and proper NA values for the paper control (further details below).

# Recode target_hum into readable levels, per experimental unit:
# controls first, then the humidity treatments; pcontrol rows are set to
# NA in a later step. (The original pre-initialization to NA was a dead
# store — the ifelse() result overwrote it immediately — so it is removed.)
exp_data$target_hum_factor <- ifelse(exp_data$type == "control", "Control", 
    ifelse(exp_data$target_hum == "0" & exp_data$type != "pcontrol", "Dry",
      ifelse(exp_data$target_hum == "35", "Mid_Low",
        ifelse(exp_data$target_hum == "70", "Mid_High",
          "High"))))

Set the humidity observations associated with the pcontrol data to “NA”, as they did not undergo humidity treatment

# Paper controls never underwent a humidity treatment, so blank them out,
# then turn the recoded labels into an ordered factor (Control first,
# then wettest to driest)
exp_data[exp_data$type == "pcontrol", "target_hum_factor"] <- NA
exp_data$target_hum_factor <- ordered(
  exp_data$target_hum_factor,
  levels = c("Control", "High", "Mid_High", "Mid_Low", "Dry")
)

Make character and numeric RH variables

# Keep a character copy of target_hum (the "#N/A" entries stay as text)
# and a numeric copy (coercion turns "#N/A" into NA, with a warning)
exp_data$target_hum <- as.character(exp_data$target_hum)
exp_data$target_hum_num <- as.numeric(exp_data$target_hum)

Calculate difference between target and actual RH

exp_data$hum_diff <- 100*exp_data$actual_hum_mean - as.numeric(exp_data$target_hum_num)

Create dataframe with both true (feather) controls and paper controls

# Long-format dataframe combining the feather (type == "control") and
# paper (type == "pcontrol") control intensities.
# Fixes: filter() is namespaced as dplyr::filter() per the file's stated
# convention, and the group labels are repeated by the actual group sizes
# rather than a hard-coded `each = 8` (which would silently misalign the
# labels if the group sizes ever differed).
feather.int <- dplyr::filter(exp_data, type == "control")$int_2wk
paper.int <- dplyr::filter(exp_data, type == "pcontrol")$int_2wk
paper.vs.regular.controls <- data.frame(
  type = rep(c("Feather", "Paper"),
             times = c(length(feather.int), length(paper.int))),
  intensity = c(feather.int, paper.int)
)

Ancillary data analysis

Relative humidity data

RH varied a little from run to run, which is to be expected. Note that our lowest and highest target RH levels were “near-0” and “near-100”.

Below we show our average measured RH data.

First let’s visualize the measured RH in relation to target RH

# Jitter plot of (measured - target) RH for treatment units, excluding
# the 0-minute observations; points dodged and coloured by time treatment
ggplot(data = dplyr::filter(exp_data, type == "treatment", time_min != 0), 
       aes(x = as.factor(target_hum_num), y = hum_diff, color = as.factor(time_min))) +
  geom_jitter(position=position_dodge(0.3), size = 3, shape = 1) +
  theme_bw() +
  labs(x = "Target relative humidity (%)", 
       y = "Measured relative humidity minus target relative humidity (%)") +
  guides(color = guide_legend(title = "Time (min)")) 
Difference between measured RH and target RH, for each time treatment

Difference between measured RH and target RH, for each time treatment

We note that there is systematic variation in the difference between target and actual humidity across treatment combinations. In particular, the group including 10min observations has actual humidity levels that are systematically higher than others for the near-zero to 70% target RH groups.

This issue is a function of the experimental setup, and can’t really be helped. The hygrometers were set up to record the air leaving the humidity chamber rather than the humidity within the chamber itself, as this was the only way the hygrometers could physically be incorporated into the apparatus. Each day, the systems were ‘warmed up’ by running them for 30mins to check that the air entering the chambers was at the desired target humidity. During the experiments, however, the actual humidity readings were unavoidably affected by the water evaporating off the feathers and leaving the system. Thus, the initial evaporation of the water off feathers caused initial increases in RH, but these settled down with time. However, with the 10min treatments, this settling down did not complete, hence the bias up towards higher RH.

Now let’s calculate descriptive stats for each of the target humidity groups (thus n = 32 per group) for both RH and temperature.

# Descriptive stats of measured RH (as %) for each target-humidity group.
# Fixes relative to the original: rows with missing target_hum_num are
# dropped with !is.na() instead of the fragile `!= "NA"` string
# comparison; the 95% CI uses $conf.int rather than the partially
# matched $conf; and the na.rm argument is removed from t.test(), which
# has no such argument (it was silently swallowed by ...).
humidity.stats <- dplyr::filter(exp_data, !is.na(target_hum_num)) %>%
  dplyr::group_by(target_hum_num) %>%
  dplyr::summarize(mean.hum = mean(100*actual_hum_mean, na.rm = TRUE), 
            sampsize = length(actual_hum_mean), 
            se.hum = sd(100*actual_hum_mean)/sqrt(length(actual_hum_mean)), 
            low.cl = t.test(100*actual_hum_mean)$conf.int[1], 
            hi.cl = t.test(100*actual_hum_mean)$conf.int[2])

Show the table output

options(knitr.kable.NA = '') # suppress showing NA values in table
# Render the RH summary table with kableExtra styling
kable(humidity.stats, booktabs = T, digits = 4,
      caption = "Summary statistics for relative humidity for the experimental runs.", 
      align = "rrrrr") %>%
  kable_styling(latex_options = c("striped", "hold_position"))
Summary statistics for relative humidity for the experimental runs.
target_hum_num mean.hum sampsize se.hum low.cl hi.cl
0 8.1432 32 0.3467 7.4361 8.8504
35 35.8961 32 0.5399 34.7950 36.9972
70 71.1343 32 0.3473 70.4260 71.8426
100 88.4094 32 0.4411 87.5097 89.3091

Temperature data

Descriptive stats

# Descriptive stats of measured temperature (C) per target-humidity group.
# Fixes relative to the original: !is.na() instead of the fragile
# `!= "NA"` string comparison; $conf.int instead of the partially matched
# $conf; and no na.rm argument to t.test(), which does not accept one
# (it was silently swallowed by ...).
temp.stats <- dplyr::filter(exp_data, !is.na(target_hum_num)) %>%
  dplyr::group_by(target_hum_num) %>%
  dplyr::summarize(mean.temp = mean(temp_C, na.rm = TRUE), 
            sampsize = length(temp_C), 
            se = sd(temp_C)/sqrt(length(temp_C)), 
            low.cl = t.test(temp_C)$conf.int[1], 
            hi.cl = t.test(temp_C)$conf.int[2])

Show the table output

options(knitr.kable.NA = '') # suppress showing NA values in table
# Render the temperature summary table with kableExtra styling
kable(temp.stats, booktabs = T, digits = 4,
      caption = "Summary statistics for temperature (Celsius) for the experimental runs.", 
      align = "rrrrr") %>%
  kable_styling(latex_options = c("striped", "hold_position"))
Summary statistics for temperature (Celsius) for the experimental runs.
target_hum_num mean.temp sampsize se low.cl hi.cl
0 22.7352 32 0.0407 22.6521 22.8182
35 22.8012 32 0.0450 22.7095 22.8929
70 22.8903 32 0.0398 22.8092 22.9714
100 22.6648 32 0.0360 22.5914 22.7382

Vapour pressure deficit

We will also calculate and visualize the corresponding values of Vapour Pressure Deficit.

We’ll use the bigleaf package for the calculations, and the default arguments for the rH.to.VPD function

# Vapour pressure deficit (kPa) from mean RH and temperature (C), using
# the bigleaf defaults; actual_hum_mean is on the 0-1 proportion scale
# expected by rH.to.VPD (it is multiplied by 100 elsewhere to get %)
exp_data$vpd <- bigleaf::rH.to.VPD(exp_data$actual_hum_mean, exp_data$temp_C)
# Descriptive stats of VPD per target-humidity group.
# Fixes relative to the original: !is.na() instead of the fragile
# `!= "NA"` string comparison; $conf.int instead of the partially matched
# $conf; and no na.rm argument to t.test(), which does not accept one.
vpd.stats <- dplyr::filter(exp_data, !is.na(target_hum_num)) %>%
  dplyr::group_by(target_hum_num) %>%
  dplyr::summarize(mean.vpd = mean(vpd, na.rm = TRUE), 
            sampsize = length(vpd), 
            se = sd(vpd)/sqrt(length(vpd)), 
            low.cl = t.test(vpd)$conf.int[1], 
            hi.cl = t.test(vpd)$conf.int[2])

Show the table output

options(knitr.kable.NA = '') # suppress showing NA values in table
# Render the VPD summary table with kableExtra styling
kable(vpd.stats, booktabs = T, digits = 4,
      caption = "Summary statistics for vapour pressure deficit for the experimental runs.", 
      align = "rrrrr") %>%
  kable_styling(latex_options = c("striped", "hold_position"))
Summary statistics for vapour pressure deficit for the experimental runs.
target_hum_num mean.vpd sampsize se low.cl hi.cl
0 2.5335 32 0.0107 2.5117 2.5552
35 1.7751 32 0.0152 1.7440 1.8062
70 0.8037 32 0.0100 0.7834 0.8240
100 0.3184 32 0.0122 0.2935 0.3433

Visualize RH, temp, VPD data

Note that the confidence interval for the group means (n = 32 per target humidity level) are too small to see on a figure (see stats above), so we do not include these in the figure.

# Fig. S3a: measured RH (%) by target humidity group, coloured by time
# treatment, with group means and 95% CIs (mean_cl_normal) nudged right;
# x-axis labels suppressed because panel C carries the shared axis
Fig_S3a <- ggplot(data = dplyr::filter(exp_data, type == "treatment", time_min != 0), 
            aes(x = as.factor(target_hum_num), y = 100*actual_hum_mean, 
                color = as.factor(time_min))) +
  geom_jitter(position=position_dodge(0.3), size = 3, shape = 1) +
   scale_y_continuous(breaks = seq(0, 100, by = 20), limits = c(0, 100)) +
  theme_bw() +
    labs(y = "Measured relative humidity (%)") +
  theme(axis.title.x = element_blank(), axis.text.x = element_blank()) +
  guides(color = "none") +
geom_point(stat="summary", fun.data = "mean_cl_normal", colour = "black", position = position_nudge(x = 0.25)) + 
  geom_errorbar(stat="summary", fun.data = "mean_cl_normal", colour = "black", width = 0.05, position = position_nudge(x = 0.25)) +
  annotate("text", x = 0.6, y = 97, label = "A")
Fig_S3a
Fig. S3a in manuscript: Measured relative humidity within treatment groups (n = 32 per humidity group, 8 per treatment combination). Black dots represent humidity group means, and bars represent 95% confidence intervals.

Fig. S3a in manuscript: Measured relative humidity within treatment groups (n = 32 per humidity group, 8 per treatment combination). Black dots represent humidity group means, and bars represent 95% confidence intervals.

Now let’s visualize the concurrent temperature data, measured alongside RH

# Fig. S3b: measured temperature (C) by target humidity group; vertical
# jitter only (jitter.width = 0) so points stay in their dodge lane;
# group means and 95% CIs (mean_cl_normal) nudged right
Fig_S3b <- ggplot(data = dplyr::filter(exp_data, type == "treatment", time_min != 0), 
       aes(x = as.factor(target_hum_num), y = temp_C, color = as.factor(time_min))) +
  geom_point(pch = 1, size = 3, position = position_jitterdodge(dodge.width = 0.3, jitter.width = 0, jitter.height = 0.1)) +
   scale_y_continuous(breaks = seq(22.2, 23.4, by = 0.4), limits = c(22.15, 23.45)) +
  theme_bw() +
  labs(y = "Measured temperature (degrees C)") +
  guides(color = guide_legend(title = "Time (min)")) +
    theme(axis.title.x = element_blank(), axis.text.x = element_blank()) +
geom_point(stat="summary", fun.data = "mean_cl_normal", colour = "black", position = position_nudge(x = 0.25)) + 
  geom_errorbar(stat="summary", fun.data = "mean_cl_normal", colour = "black", width = 0.05, position = position_nudge(x = 0.25)) +
  annotate("text", x = 0.6, y = 23.4, label = "B")
Fig_S3b 
Fig. S3b in manuscript: Measured temperature (degrees C) within treatment groups (n = 32 per humidity group, 8 per treatment combination). Black dots represent humidity group means, and bars represent 95% confidence intervals

Fig. S3b in manuscript: Measured temperature (degrees C) within treatment groups (n = 32 per humidity group, 8 per treatment combination). Black dots represent humidity group means, and bars represent 95% confidence intervals

# Fig. S3c: vapour pressure deficit (kPa) by target humidity group, with
# group means and 95% CIs (mean_cl_normal) nudged right; this panel keeps
# the shared x-axis title for the stacked figure
Fig_S3c <- ggplot(data = dplyr::filter(exp_data, type == "treatment", time_min != 0), 
       aes(x = as.factor(target_hum_num), y = vpd, color = as.factor(time_min))) +
  geom_jitter(position=position_dodge(0.3), size = 3, shape = 1) +
  scale_y_continuous(breaks = seq(0, 2.75, by = 0.25), limits = c(0, 2.75)) +
  theme_bw() +
  labs(x = "Target relative humidity (%)", 
       y = "Vapour pressure deficit (kPa)") +
  guides(color = "none") +
geom_point(stat="summary", fun.data = "mean_cl_normal", colour = "black", position = position_nudge(x = 0.25)) + 
  geom_errorbar(stat="summary", fun.data = "mean_cl_normal", colour = "black", width = 0.05, position = position_nudge(x = 0.25)) +
  annotate("text", x = 0.6, y = 2.65, label = "C")
Fig_S3c 
Fig. S3c in manuscript: Vapour pressure deficit within treatment groups (n = 32 per humidity group, 8 per treatment combination). Black dots represent humidity group means, and bars represent 95% confidence intervals.

Fig. S3c in manuscript: Vapour pressure deficit within treatment groups (n = 32 per humidity group, 8 per treatment combination). Black dots represent humidity group means, and bars represent 95% confidence intervals.


Estimating cell density

Here our straightforward goal is to evaluate whether our source diatom culture solution maintained sufficiently high densities throughout the duration of the experiment, such that results of the experiment did not exhibit serial trends.

In order to acquire estimates of cell density over time within our source diatom culture, we constructed a calibration curve to relate spectrofluorometry intensity readings to microscopy-based estimates of cell counts. Specifically, three series of two-fold dilutions of the culture (9 dilutions from the source culture, N = 10 per series) were generated.

Transforming count data

For the microscopy cell counts, 10µL was pipetted from the cuvette onto a glass slide and covered with a 22 x 22mm glass coverslip. The diatom cells were counted within the 40x objective lens field of view, at four different locations on the slide. These counts were used to approximate cell count per mL of culture using the formula:

\[{C \cdot A \over 4 \cdot FOV} \cdot 100\]

Where:

  • C is the diatom count tallied across 4 fields of view (these are the count values reported in the “count” field within the original serial_dilutions CSV file)
  • A is the area of the coverslip (22mm x 22mm) (we assume the 10µL spread evenly across this surface area)
  • FOV is the circular area (mm\(^2\)) of a field of view; we used a 40x objective with a 10x/22 ocular, so:

The diameter (mm) of the field of view \[ D = {22 \over (40 \cdot 10)} = 0.055 \]

# Coverslip area A (mm^2) and field-of-view diameter D (mm) for the 40x
# objective with 10x/22 ocular (22 mm field number / 400x magnification)
A <- 22*22
D <- 22/(40*10)

And:

\[ FOV = \pi \cdot ({D \over 2 })^2 \\ = \pi \cdot ({0.055 \over 2 })^2 \\ = 2.3575829 \times 10 ^{-3} \]

FOV <- pi*(D/2)^2
  • 4 is because we counted across 4 fields of view
  • 100 is the conversion factor to convert 10µL to 1mL
dilutions$count <- 100*(dilutions$count*A)/(FOV*4)

Calibration

For our work on calibration and prediction of cell density within the diatom solution, we consulted the following online materials: this website http://weightinginbayesianmodels.github.io/poctcalibration/calib_tut4_curve_background.html, and the associated functions at this github website https://github.com/WeightingInBayesianModels/poctcalibration.

And we also made use of the investr package https://github.com/bgreenwell/investr, which provides relevant methods.

Visualize the scatterplot between cell count and spectro intensity first, both raw and log-transformed x values:

# Scatterplot of spectrofluorometer intensity vs cell count (raw x scale)
cal_curve_scatter <- ggplot(data = dilutions, aes(y = int, x = count)) +
  geom_point(shape = 1, size = 2) + 
  theme_bw() +
    labs(y = "Relative Spectrofluorometer Intensity (arbitrary units)", 
         x = "Diatom cell count (cells/mL)")
cal_curve_scatter
Scatterplot of spectro intensity in relation to diatom cell counts

Scatterplot of spectro intensity in relation to diatom cell counts

Log scale:

# Same scatterplot on a log x scale; +1 avoids log(0) for zero counts
cal_curve_scatter_log <- ggplot(data = dilutions, aes(y = int, x = log(count + 1))) +
  geom_point(shape = 1, size = 2) + 
  theme_bw() +
    labs(y = "Relative Spectrofluorometer Intensity (arbitrary units)", 
         x = "Diatom cell count (cells/mL) (log scale)")
#    geom_smooth(method='lm', formula= y~x)
cal_curve_scatter_log
Scatterplot of spectro intensity in relation to diatom cell counts (log)

Scatterplot of spectro intensity in relation to diatom cell counts (log)

From these figures it appears that the curve conforms to the four or five parameter logistic model, here shown in both regular and inverse form:

Fit the calibration model

Starting values for parameter estimation. These were based on trial runs tracing convergence.

Start with these values, and convergence does happen after 30 steps or so

# Mapping between conventional logistic-model parameter letters and the
# argument names used by M.4pl / M.5pl:
# b = "hill"
# d = "small.x.asymp"
# a = "inf.x.asymp"
# c = "inflec" for the 4pl
# c.5pl = "c.5pl" for the 5pl
# g = "g.5pl" for the 5pl
#
# Starting values for the 4PL fit, chosen from trial runs tracing
# convergence (convergence occurs after ~30 iterations from here)
start.vals.4pl <- c(small.x.asymp = 1.2, inf.x.asymp = 120, 
                inflec = 10000, hill = -2)

Fit the calibration model using the minpack.lm package and nlsLM function and M.4pl function defined at top:

# Fit the 4PL calibration curve (intensity ~ cell count) by
# Levenberg-Marquardt nonlinear least squares
cal.curve.4pl <- minpack.lm::nlsLM(int ~ M.4pl(count, small.x.asymp, 
                  inf.x.asymp, inflec, hill),
                       data = dilutions,
                       start = start.vals.4pl)
# Parameter estimates, standard errors and residual standard error
summary(cal.curve.4pl)
## 
## Formula: int ~ M.4pl(count, small.x.asymp, inf.x.asymp, inflec, hill)
## 
## Parameters:
##                 Estimate Std. Error t value Pr(>|t|)    
## small.x.asymp  2.302e+00  1.734e+00   1.327    0.196    
## inf.x.asymp    1.546e+02  1.834e+01   8.428 6.57e-09 ***
## inflec         3.261e+08  5.929e+07   5.500 9.03e-06 ***
## hill          -1.647e+00  2.409e-01  -6.838 2.94e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.369 on 26 degrees of freedom
## 
## Number of iterations to convergence: 32 
## Achieved convergence tolerance: 1.49e-08

Visualize the model and residual plot using the customized plotDiag.nls function (defined in the “Define functions” section above, adapted from the investr materials); the residual plot shows no issue of heteroscedasticity

plotDiag.nls(cal.curve.4pl)

Predict culture cell density

We now have a model for predicting cell density in our culture.

Use the calibration model above to estimate maximum and minimum cell density in our source culture (day 1 versus day 8):

# Predict source-culture cell density (cells/mL) for each experiment day
# by inverting the 4PL calibration curve at that day's measured culture
# intensity. Fixes relative to the original: filter() is namespaced as
# dplyr::filter() per the file's convention; the control rows are filtered
# once instead of twice; sapply() is replaced by the type-stable vapply()
# (unname() strips the "x.hat" name Inv.4pl attaches); and the day index
# is derived from the data length rather than hard-coded 1:8.
culture.int <- dplyr::filter(exp_data, type == "control")$culture_int
cal.coefs <- coef(cal.curve.4pl)
estimated.density.time <- data.frame(
  day = seq_along(culture.int),
  intensity = culture.int,
  estimated.density = vapply(culture.int, function(x){
    unname(Inv.4pl(x, cal.coefs[1], cal.coefs[2],
                   cal.coefs[3], cal.coefs[4]))
  }, numeric(1)))

Show predictions:

estimated.density.time
##   day intensity estimated.density
## 1   1   152.165        4013338214
## 2   2   119.456         677777187
## 3   3   107.158         528033128
## 4   4    94.827         425317316
## 5   5    78.333         325580440
## 6   6    72.850         298270055
## 7   7    67.930         275507191
## 8   8    56.628         228017555

Visualize data with source culture range (dashed) and LOD (solid) indicated:

# Intensity and predicted density at the start (day 1) and end (day 8)
# of the experiment, used to annotate the calibration figure below
max.int <- dplyr::filter(estimated.density.time, day == 1)$intensity
max.dens <- dplyr::filter(estimated.density.time, day == 1)$estimated.density
min.int <- dplyr::filter(estimated.density.time, day == 8)$intensity
min.dens <- dplyr::filter(estimated.density.time, day == 8)$estimated.density

Calibration curve figure

Create a figure showing the calibration curve using the plotFit function from the investr package:

# Fig. S4: fitted 4PL calibration curve with data (investr::plotFit),
# annotated with the measured day-1 and day-8 culture intensities (red
# horizontal segments) and the day-8 predicted count (red arrow)
investr::plotFit(cal.curve.4pl,
     xlab = "Diatom cell count (cells/mL)",
     ylab = "Relative Spectrofluorometer Intensity (arbitrary units)",
#     col.pred = adjustcolor("lightgrey", 0.5), shade = TRUE,
     pch = 1, ylim = c(-10, 160), las = 1)
segments(-50000000, max.int, 1000000000, max.int, lty = 1, col = "red")
segments(-50000000, min.int,min.dens, min.int, lty = 1, col = "red")
arrows(min.dens, min.int, min.dens, -15, lty = 1, length = 0.15, angle = 20, col = "red")
text(min.dens + 10000, -10, labels = "Predicted count day 8", col = "red", pos = 4, cex = 0.8)
text(-30000, max.int - 8, labels = "Measured intensity day 1", col = "red", pos = 4, cex = 0.8)
text(-30000, min.int + 8, labels = "Measured intensity day 8", col = "red", pos = 4, cex = 0.8)
Fig. S4 in manuscript: Calibration curve for estimating diatom cell density and associated predicted cell densities for cell culture

Fig. S4 in manuscript: Calibration curve for estimating diatom cell density and associated predicted cell densities for cell culture


Calculate Limit of Detection

Because the feathers themselves are a key part of the experimental procedure, we use the feather controls for the calculation of the LOD (N = 8). Here we show the spectrofluorometer data from both the feather and paper controls, and confirm no difference in group means.

Following convention, we use the following calculation of the LOD:

mean + 3 standard deviations

Retrieve spectrofluorometer readings for experimental control from the exp_data dataframe (n = 8), and take mean:

# Per-control-type LOD: group mean intensity plus three standard deviations.
LODs.feather.paper <- paper.vs.regular.controls %>%
  dplyr::group_by(type) %>%
  dplyr::summarise(
    meanval = mean(intensity),
    LOD = mean(intensity) + 3 * sd(intensity)
  )

Visualize control data and LOD

# Fig. S5: intensity readings for the two control groups, with group
# mean +/- SE (black, nudged right) and the feather-based LOD (dashed line).
# Change: dplyr:: prefix added to filter() per the script's stated convention.
Fig_S5 <- ggplot(data = paper.vs.regular.controls, 
                 aes(x = as.factor(type), y = intensity)) +
  geom_jitter(width = 0.1, size = 3, shape = 1, col = "firebrick") +
  scale_y_continuous(breaks = seq(0, 1.25, by = .25), limits = c(-0.005, 1.26)) +
  theme_bw() +
  geom_point(stat = "summary", fun.data = "mean_se", shape = 16, size = 2,
             colour = "black", position = position_nudge(x = 0.15)) + 
  geom_errorbar(stat = "summary", fun.data = "mean_se", colour = "black",
                lwd = 0.75, width = 0.05, position = position_nudge(x = 0.15)) +
  labs(x = "Control group", 
       y = "Relative Spectrofluorometer Intensity (arbitrary units)") +
  geom_hline(yintercept = dplyr::filter(LODs.feather.paper, 
            type == "Feather")$LOD, linetype = "dashed") 
Fig_S5
Fig. S5 in manuscript: Spectrofluorometer intensity readings for control groups, showing LOD. Black points represent group means, and bars extend to +/- one standard error.

Fig. S5 in manuscript: Spectrofluorometer intensity readings for control groups, showing LOD. Black points represent group means, and bars extend to +/- one standard error.


Main statistical analysis

Data preparation

Create dataset for modeling the main experiment data. Filter out the control data (which were analyzed above), then re-code the target humidity variable as a factor with its levels ordered from High to Dry.

# Drop the control rows (time_min == 0) and re-level the target humidity
# factor from wettest (High) to driest (Dry) in a single factor() call.
final.glm.data <- dplyr::filter(exp_data, time_min != 0)
final.glm.data$target_hum_factor <- factor(
  as.character(final.glm.data$target_hum_factor),
  levels = c("High", "Mid_High", "Mid_Low", "Dry")
)

Create binary “success” variable, where success is spectro intensity readings greater than our calculated LOD (above)

# Binary outcome: 1 when the two-week intensity exceeds the feather LOD,
# 0 otherwise.
final.glm.data$success <- as.numeric(
  final.glm.data$int_2wk >
    dplyr::filter(LODs.feather.paper, type == "Feather")[["LOD"]]
)

Create numeric time variable

final.glm.data$time_min_num <- as.numeric(final.glm.data$time_min)

Re-scale the humidity variable so that it is expressed in percent

final.glm.data$scaled.hum <- 100*final.glm.data$actual_hum_mean

Construct GLM

Reviewers of the first version of the manuscript pointed out that vapour pressure deficit (VPD) is more meaningful than relative humidity (RH), so we’ll repeat all the main analyses below using VPD in lieu of RH. These new analyses appear after the original ones.

Model using RH

The full model including interaction term is the appropriate model given the factorial experiment.

glm.full <- glm(data = final.glm.data, success ~ scaled.hum * time_min_num, family = "binomial")

Get model output

summary(glm.full)
## 
## Call:
## glm(formula = success ~ scaled.hum * time_min_num, family = "binomial", 
##     data = final.glm.data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.00101  -0.25909  -0.02339   0.52884   2.16979  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)   
## (Intercept)             -3.4202972  1.6297249  -2.099  0.03584 * 
## scaled.hum               0.0587368  0.0217283   2.703  0.00687 **
## time_min_num            -0.0895784  0.0404624  -2.214  0.02684 * 
## scaled.hum:time_min_num  0.0009251  0.0004626   2.000  0.04553 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 146.114  on 127  degrees of freedom
## Residual deviance:  71.502  on 124  degrees of freedom
## AIC: 79.502
## 
## Number of Fisher Scoring iterations: 8

Get confidence intervals on coefficients:

confint(glm.full)
## Waiting for profiling to be done...
##                                 2.5 %       97.5 %
## (Intercept)             -7.3554860269 -0.717267069
## scaled.hum               0.0221713102  0.110171403
## time_min_num            -0.1848864629 -0.022631338
## scaled.hum:time_min_num  0.0001427493  0.002004403

For comparison, use the likelihood ratio test approach:

# Nested reference models for the likelihood ratio tests below:
# intercept-only, humidity-only, and additive humidity + time.
glm.null <- glm(data = final.glm.data, success ~ 1, family = "binomial")
glm.hum <- glm(data = final.glm.data, success ~ scaled.hum, family = "binomial")
glm.hum_time <- glm(data = final.glm.data, success ~ scaled.hum + time_min_num, family = "binomial")

Model comparisons:

anova(glm.null, glm.hum, test = "LRT")
## Analysis of Deviance Table
## 
## Model 1: success ~ 1
## Model 2: success ~ scaled.hum
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       127    146.114                          
## 2       126     88.829  1   57.285 3.771e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(glm.hum, glm.hum_time, test = "LRT")
## Analysis of Deviance Table
## 
## Model 1: success ~ scaled.hum
## Model 2: success ~ scaled.hum + time_min_num
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)    
## 1       126     88.829                         
## 2       125     77.155  1   11.674 0.000634 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(glm.hum_time, glm.full, test = "LRT")
## Analysis of Deviance Table
## 
## Model 1: success ~ scaled.hum + time_min_num
## Model 2: success ~ scaled.hum * time_min_num
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       125     77.155                       
## 2       124     71.502  1   5.6533  0.01742 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Calculate “pseudo-R-square” values using the pR2 function in the pscl package:

pscl::pR2(glm.full)
##         llh     llhNull          G2    McFadden        r2ML        r2CU 
## -35.7510184 -73.0568196  74.6116024   0.5106409   0.4417247   0.6489611

Model using VPD (used for geospatial predictions)

The full model including interaction term is the appropriate model given the factorial experiment.

glm.full.vpd <- glm(data = final.glm.data, success ~ vpd * time_min_num, family = "binomial")

Get model output

summary(glm.full.vpd)
## 
## Call:
## glm(formula = success ~ vpd * time_min_num, family = "binomial", 
##     data = final.glm.data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.00389  -0.26468  -0.02623   0.52777   2.20346  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)   
## (Intercept)       2.448012   0.765232   3.199  0.00138 **
## vpd              -2.111239   0.776629  -2.718  0.00656 **
## time_min_num      0.002575   0.007195   0.358  0.72047   
## vpd:time_min_num -0.032695   0.016455  -1.987  0.04692 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 146.114  on 127  degrees of freedom
## Residual deviance:  71.713  on 124  degrees of freedom
## AIC: 79.713
## 
## Number of Fisher Scoring iterations: 8

Get confidence intervals on coefficients:

confint(glm.full.vpd)
## Waiting for profiling to be done...
##                        2.5 %       97.5 %
## (Intercept)       1.05849016  4.098242055
## vpd              -3.94683956 -0.802387251
## time_min_num     -0.01131228  0.017634614
## vpd:time_min_num -0.07113824 -0.004893459

For comparison, use the likelihood ratio test approach:

# Nested VPD reference models for the likelihood ratio tests below:
# intercept-only, VPD-only, and additive VPD + time.
glm.null.vpd <- glm(data = final.glm.data, success ~ 1, family = "binomial")
glm.vpd <- glm(data = final.glm.data, success ~ vpd, family = "binomial")
glm.vpd_time <- glm(data = final.glm.data, success ~ vpd + time_min_num, family = "binomial")

Model comparisons:

anova(glm.null.vpd, glm.vpd, test = "LRT")
## Analysis of Deviance Table
## 
## Model 1: success ~ 1
## Model 2: success ~ vpd
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       127     146.11                          
## 2       126      89.01  1   57.104 4.134e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(glm.vpd, glm.vpd_time, test = "LRT")
## Analysis of Deviance Table
## 
## Model 1: success ~ vpd
## Model 2: success ~ vpd + time_min_num
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       126     89.010                          
## 2       125     77.296  1   11.713 0.0006206 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(glm.vpd_time, glm.full.vpd, test = "LRT")
## Analysis of Deviance Table
## 
## Model 1: success ~ vpd + time_min_num
## Model 2: success ~ vpd * time_min_num
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       125     77.296                       
## 2       124     71.713  1   5.5836  0.01813 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Calculate “pseudo-R-square” values using the pR2 function in the pscl package:

pscl::pR2(glm.full.vpd)
##         llh     llhNull          G2    McFadden        r2ML        r2CU 
## -35.8563993 -73.0568196  74.4008405   0.5091985   0.4408047   0.6476095

Note that the two approaches (RH versus VPD) yield qualitatively identical results, and quantitatively very similar results.


Visualize the results

First we’ll look at the interaction plot of the main experiment results.

Here we use the ggeffects package and its ggpredict function, which provides predictions for the focal variable holding the others constant. It also provides 95% confidence bands:

We’ll generate predictions using the original “time” (= “duration”) values in the experiment, and the actual (measured) values of mean relative humidity per group.

We use the ggpredict function from the ggeffects package.

full.glm.ggpredict <- ggeffects::ggpredict(glm.full, 
      terms = c("time_min_num [10, 60, 120, 240]", "scaled.hum [8, 36, 71, 88]"))
full.glm.ggpredict
## 
## # Predicted probabilities of success 
## # x = time_min_num 
## 
## # scaled.hum = 36
##    x predicted std.error conf.low conf.high
##   10     0.134     0.795    0.031     0.423
##   60     0.009     1.182    0.001     0.086
##  120     0.000     2.470    0.000     0.039
##  240     0.000     5.283    0.000     0.011
## 
## # scaled.hum = 71
##    x predicted std.error conf.low conf.high
##   10     0.625     0.415    0.425     0.790
##   60     0.335     0.408    0.185     0.529
##  120     0.107     0.796    0.025     0.364
##  240     0.007     1.768    0.000     0.179
## 
## # scaled.hum = 8
##    x predicted std.error conf.low conf.high
##   10     0.022     1.307    0.002     0.230
##   60     0.000     1.894    0.000     0.015
##  120     0.000     3.844    0.000     0.005
##  240     0.000     8.147    0.000     0.001
## 
## # scaled.hum = 88
##    x predicted std.error conf.low conf.high
##   10     0.841     0.550    0.643     0.940
##   60     0.779     0.417    0.608     0.889
##  120     0.683     0.366    0.513     0.816
##  240     0.447     0.656    0.183     0.745
## Standard errors are on link-scale (untransformed).

And using VPD (using the mean values of VPD per target humidity group; see Table 3 above):

Note that we’ll need to reverse the ordering of the VPD values after, to correspond with order of increasing RH values:

full.glm.vpd.ggpredict <- ggeffects::ggpredict(glm.full.vpd, 
      terms = c("time_min_num [10, 60, 120, 240]", "vpd [0.32, 0.80, 1.78, 2.53]"))
full.glm.vpd.ggpredict
## 
## # Predicted probabilities of success 
## # x = time_min_num 
## 
## # vpd = 0.32
##    x predicted std.error conf.low conf.high
##   10     0.845     0.556    0.647     0.942
##   60     0.786     0.423    0.616     0.894
##  120     0.695     0.372    0.524     0.826
##  240     0.470     0.660    0.195     0.764
## 
## # vpd = 0.8
##    x predicted std.error conf.low conf.high
##   10     0.628     0.415    0.428     0.792
##   60     0.342     0.404    0.190     0.534
##  120     0.112     0.784    0.026     0.370
##  240     0.007     1.741    0.000     0.184
## 
## # vpd = 1.78
##    x predicted std.error conf.low conf.high
##   10     0.134     0.791    0.032     0.422
##   60     0.009     1.171    0.001     0.087
##  120     0.000     2.448    0.000     0.040
##  240     0.000     5.238    0.000     0.012
## 
## # vpd = 2.53
##    x predicted std.error conf.low conf.high
##   10     0.024     1.280    0.002     0.234
##   60     0.000     1.849    0.000     0.017
##  120     0.000     3.756    0.000     0.006
##  240     0.000     7.965    0.000     0.001
## Standard errors are on link-scale (untransformed).

Now let’s reverse the ordering of the VPD levels:

full.glm.vpd.ggpredict$group <- factor(full.glm.vpd.ggpredict$group, levels = rev(levels(full.glm.vpd.ggpredict$group)))

Now we can visualize these predictions, first using RH, then using VPD:

# Facet labels mapping scaled.hum values (%) to readable panel titles.
labels.facet <- c("Mean RH = 8%", "Mean RH = 36%", "Mean RH = 71%", "Mean RH = 88%")
names(labels.facet) <- c("8", "36", "71", "88")
# Fig. 2: predicted viability probability vs. exposure time, one panel per
# mean RH level, with 95% confidence intervals from ggpredict.
model.plot <- ggplot(full.glm.ggpredict, aes(x = x, y = predicted)) +
  geom_point(position = position_dodge(.2), shape = 1, size = 2.5) +
  theme_bw() +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 2) +
  labs(x = "Exposure time (minutes)", y = "Probability of diatoms being viable") +
  ylim(0, 1) +
  scale_x_continuous(breaks = c(10, 60, 120, 240), limits = c(-10, 260)) +
  theme(panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
          panel.grid.major.x = element_line(colour = "lightgrey", size = (0.25)),
          panel.grid.minor.x = element_blank(), 
         panel.grid.minor.y = element_blank(),
          panel.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA)) +
  facet_grid(. ~ group, labeller = labeller(group = labels.facet))
model.plot
Fig_02 in manuscript. Main experimental results showing probability of diatoms remaining viable in relation to the experimental treatments.  Points represent treatment means, and error bars represent 95% confidence bands calculated using default settings in the `ggpredict` package.

Fig_02 in manuscript. Main experimental results showing probability of diatoms remaining viable in relation to the experimental treatments. Points represent treatment means, and error bars represent 95% confidence bands calculated using default settings in the ggpredict package.

Now VPD:

# Facet labels for the VPD version; note levels were reversed above so
# panels run from driest (VPD 2.53) to wettest (0.32), mirroring RH order.
labels.facet <- c("Mean VPD = 2.53", "Mean VPD = 1.78", "Mean VPD = 0.80", "Mean VPD = 0.32")
names(labels.facet) <- c("2.53", "1.78", "0.8", "0.32")
# Fig. S6: same layout as Fig. 2 but with VPD defining the panels.
Fig_S6_vpd.glm.plot <- ggplot(full.glm.vpd.ggpredict, aes(x = x, y = predicted)) +
  geom_point(position = position_dodge(.2), shape = 1, size = 2.5) +
  theme_bw() +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 2) +
  labs(x = "Exposure time (minutes)", y = "Probability of diatoms being viable") +
  ylim(0, 1) +
  scale_x_continuous(breaks = c(10, 60, 120, 240), limits = c(-10, 260)) +
  theme(panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
          panel.grid.major.x = element_line(colour = "lightgrey", size = (0.25)),
          panel.grid.minor.x = element_blank(), 
         panel.grid.minor.y = element_blank(),
          panel.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA)) +
  facet_grid(. ~ group, labeller = labeller(group = labels.facet))
Fig_S6_vpd.glm.plot
Main experimental results showing probability of diatoms remaining viable in relation to the experimental treatments.  Points represent treatment means, and error bars represent 95% confidence bands calculated using default settings in the `ggpredict` package.

Main experimental results showing probability of diatoms remaining viable in relation to the experimental treatments. Points represent treatment means, and error bars represent 95% confidence bands calculated using default settings in the ggpredict package.

Predict across RH, VPD and distance

Create a dataframe with VPD values from 0.25 to 1.25 kPa (typical of the study region; see below), an average flight speed of 69 km/h (see manuscript), and exposure times from 10 to 240 minutes. Then use our GLM model to predict the probability of viability across the corresponding distances travelled.

# Prediction grid: exposure times 10-240 min crossed with five VPD values
# (0.25-1.25 kPa) at the average mallard flight speed of 69 km/h.
disp.distances <- expand.grid(
  time_min_num = 10:240,
  vpd = seq(from = 0.25, to = 1.25, length = 5),
  flight.speed = 69
)
# Predicted viability probability from the VPD GLM, and distance flown (km).
disp.distances$predicted.prob.vpd <- round(
  predict(glm.full.vpd, newdata = disp.distances, type = "response"), 4
)
disp.distances$dist <- disp.distances$time_min_num * disp.distances$flight.speed / 60

For plotting, create a factor variable for VPD, and re-order levels such that VPD ordered high values to low values:

# Ordered-factor version of VPD, levels reversed (1.25 down to 0.25) so the
# legend lists the driest condition first.
disp.distances$vpd.factor <- ordered(
  disp.distances$vpd,
  levels = rev(as.character(seq(from = 0.25, to = 1.25, length = 5)))
)

Visualize probability as a function of distance for different values of VPD, using average flight speed of 69 km/h.

# Fig. 3A: predicted viability probability vs. flight distance (km) for five
# VPD levels, assuming a 69 km/h average flight speed; line types encode VPD.
Fig3A.dist.prob.vpd <- ggplot(data = disp.distances, aes(y = predicted.prob.vpd, 
                  x = dist, group = vpd.factor)) +
  scale_y_continuous(breaks = seq(0, 1, by = 0.1)) +
  scale_x_continuous(breaks = seq(0, 320, by = 40)) + 
  geom_line(aes(linetype = vpd.factor), size = 0.8) +
  scale_linetype_manual(values = 5:1, labels = rev(as.character(seq(from = 0.25, to = 1.25, length = 5))), 
                      name = "Vapour pressure deficit (kPa)") +
  theme(legend.background = element_rect(colour = 'black'), 
        legend.key = element_rect(fill = 'white'), 
        legend.position = "top", legend.key.size = unit(1, "cm"), 
        legend.title = element_text(size = 9), 
        legend.text = element_text(size = 7)) +
    guides(linetype = guide_legend(nrow = 1, reverse = T, 
                 title.position = "top", title.hjust = 0.5, title.vjust = - 1)) +
theme(panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
          panel.grid.major.x = element_line(colour = "lightgrey", size = (0.25)),
           panel.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA)) +
  labs(x = "Distance travelled by mallard (km)", y = "Probability of diatoms remaining viable") +
  theme(axis.text.x = element_text(size = 7)) +
  theme(axis.text.y = element_text(size = 7)) +
  theme(axis.title.x = element_text(size = 9)) +
  theme(axis.title.y = element_text(size = 9)) +
  annotate("text", x = 290, y = 0.95, label = "A") # panel label for composite figure
Fig3A.dist.prob.vpd
Probability of being viable as a function of vapour pressure deficit and flight distance. Predictions assume average flight speed of 69 km/h.

Probability of being viable as a function of vapour pressure deficit and flight distance. Predictions assume average flight speed of 69 km/h.


Ordinal factor GLM

Here we report the results from coding the time and humidity predictors as ordinal factors, instead of continuous numeric variables.

We conduct likelihood ratio tests (LRTs) to compare models that use ordinal factors.

Create ordinal factor variables:

final.glm.data$time_min_fact <- as.factor(final.glm.data$time_min)

Build models using treatment contrasts:

# GLMs with humidity and time coded as factors (treatment contrasts):
# full interaction, null, humidity-only, and additive models, compared
# by likelihood ratio tests below.
glm.full.fact <- glm(data = final.glm.data, success ~ target_hum_factor * time_min_fact, 
                     family = "binomial")
glm.null.fact <- glm(data = final.glm.data, success ~ 1, family = "binomial")
glm.hum.fact <- glm(data = final.glm.data, success ~ target_hum_factor, 
                    family = "binomial")
glm.hum_time.fact <- glm(data = final.glm.data, success ~ target_hum_factor + time_min_fact, 
                         family = "binomial")

Now compare fits using LRTs:

anova(glm.null.fact, glm.hum.fact, glm.hum_time.fact, glm.full.fact, test = "LRT")
## Analysis of Deviance Table
## 
## Model 1: success ~ 1
## Model 2: success ~ target_hum_factor
## Model 3: success ~ target_hum_factor + time_min_fact
## Model 4: success ~ target_hum_factor * time_min_fact
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       127    146.114                          
## 2       124     88.976  3   57.137 2.402e-12 ***
## 3       121     73.135  3   15.842  0.001222 ** 
## 4       112     57.665  9   15.470  0.078813 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We see that the full model that includes the interaction between humidity and time gains slightly less support than it did when coding the predictors as numeric. This is to be expected because in this approach the model uses a degree of freedom for each level (minus 1) for each term.

NOTE There is no need to repeat the preceding analysis with VPD, because the ordinal factor “target_hum_factor” would serve identically.

Aspatial and spatial predictions

With the aim of placing our experiment results into real-world contexts we sought relevant data pertaining to North and South Dakota and Nebraska. This region encompasses the prairie pothole region of the USA, an important breeding ground, and is also within the central migration flyway. We sought information about the following:

  • relative humidity and air temperature throughout the study region and during the key months
  • distribution and flight data for mallard ducks
  • distribution of surface waters (potential habitat for ducks and diatoms)
  • distribution of the genus Nitzschia and of the species Nitzschia pussella in surface waters

With the relative humidity and air temperature data, we could calculate corresponding values of vapour pressure deficit.

When combined, this information can help predict the potential for mallard-borne diatom dispersal within region.


Data sources


Import data

Relative humidity data

Here we access online data about relative humidity and air temperature within the states of interest, and during the spring months corresponding to arrival of ducks into nesting areas (April) or during stopovers for continued migration north, and during nesting times (through end of May).

We will use the riem package, described here: https://docs.ropensci.org/riem/

We will retrieve relative humidity and air temperature data for the months of April through May inclusive for each state.

Get the ASOS networks (state-level) and stations (airports) of interest.

First, we know from the online vignette https://docs.ropensci.org/riem/articles/riem.html that the networks have a “code” that is simply the state abbreviation appended to “ASOS” with an underscore:

network.codes <- paste(c("ND", "SD", "NE"), "ASOS", sep = "_")

So, for our states of interest, we can get the station codes as follows:

# Retrieve the station list for each state ASOS network and combine them,
# tagging each station with its two-letter state code (parsed from the
# network code, e.g. "ND_ASOS" -> "ND").
# Changes: results are built as a list and bound once rather than grown
# inside a loop; bind_rows gets its dplyr:: prefix per the script's stated
# convention (also replaces the stray `=` assignment with `<-` semantics).
asos.stations <- dplyr::bind_rows(lapply(network.codes, function(code) {
  stations <- riem::riem_stations(network = code)
  stations$state <- unlist(strsplit(code, "_"))[1]
  stations
}))

Set the date values (we’ll use the last six years of data)

data.years <- seq(2015, 2020, by = 1)
start.mo.day <- "04-01"
end.mo.day <- "05-30"

The field name for Relative Humidity (%) is "relh", and temperature is "tmpf" (Fahrenheit), and we can now download these data for the correct years of interest, then we'll subset and average the months of interest (April through May inclusive).

First, loop through the years and stations to download the data:

NOTE This takes > 10 minutes, so I've turned "eval = F"; change this to "eval = T" if you wish.

# Download hourly RH ("relh") and temperature ("tmpf", Fahrenheit) readings
# for every station and year, April 1 - May 30.
# Changes: per-year results are collected in a preallocated list and bound
# once at the end (avoids quadratic rbind growth); riem_measures gets its
# riem:: prefix per the script's stated convention; seq_along replaces
# 1:length(). The per-year progress message is unchanged.
yearly.data <- vector("list", length(data.years))
for (j in seq_along(data.years)) {
  yr <- as.character(data.years[j])
  station.data <- lapply(asos.stations$id, function(stn) {
    as.data.frame(riem::riem_measures(
      station = stn,
      date_start = paste(yr, start.mo.day, sep = "-"),
      date_end = paste(yr, end.mo.day, sep = "-")
    ))[, c("station", "valid", "relh", "tmpf")]
  })
  yearly.data[[j]] <- do.call(rbind, station.data)
  print(paste("done j loop", j, sep = " "))
}
asos.data <- do.call(rbind, yearly.data)

Write out file for future use; here “eval = F”; change “eval = T” to run.

# Drop rows without humidity readings, then cache the download locally.
# NOTE(review): relh is compared against the string "null" — presumably how
# the web service encodes missing values; note that rows where relh is NA
# are also dropped here (NA != "null" evaluates to NA) — confirm intended.
asos.data <- dplyr::filter(asos.data, relh != "null")
write.csv(asos.data, "../rawdata/ASOS_data_from_web.csv")

Mallard flight data

Import data from Viana et al. (2013) from Dryad https://datadryad.org/resource/doi:10.5061/dryad.619gd and create new dataframe containing only relevant records (mallards, North America)

NOTE: on the Dryad page linked above, you can find out the correct URL address for the files of interest by clicking on the “data files” link on the right of the webpage, then right-clicking the file name and “copy link address”

I was unable to derive this information directly using rdryad functions, perhaps because this is an older 2013 repository page that was not structured properly? In any case, the code below is reproducible.

# banding data file
rdryad::dryad_fetch(url = 'https://datadryad.org/stash/downloads/file_stream/39802',
                (tempfile1 <- tempfile(fileext = ".txt")))
## $`https://datadryad.org/stash/downloads/file_stream/39802`
## [1] "/var/folders/xd/lp87q1k11r1d8tbk952jkf4m0000gp/T//Rtmp7TedKg/file730e2d066613.txt"
file.copy(tempfile1, "../rawdata/banding_data.txt")
## [1] FALSE
# IF YOU WANT, download readme file for banding data
rdryad::dryad_fetch(url = 'https://datadryad.org/stash/downloads/file_stream/39803', 
                (tempfile2 <- tempfile(fileext = ".rtf")))
## $`https://datadryad.org/stash/downloads/file_stream/39803`
## [1] "/var/folders/xd/lp87q1k11r1d8tbk952jkf4m0000gp/T//Rtmp7TedKg/file730e7f17807c.rtf"
file.copy(tempfile2, "../rawdata/banding_data_readme.rtf")
## [1] FALSE
# Read the downloaded banding records and keep only North American mallards.
# Change: dplyr:: prefix added to filter() per the script's stated convention.
banding.data <- read.delim("../rawdata/banding_data.txt")
mallard.data <- dplyr::filter(banding.data, Region == "North America" & Species == "Anas platyrhynchos")

Surface waters and reference maps

Import administrative boundaries, lakes, and flyways data using the sf package function st_read:

borders <- sf::st_read(
  "../rawdata/spatial/admin_shapes/ne_10m_admin_1_states_provinces_lakes.shp")
## Reading layer `ne_10m_admin_1_states_provinces_lakes' from data source `/Users/jpither/OneDrive - The University Of British Columbia/work/students/faye/diatom/rawdata/spatial/admin_shapes/ne_10m_admin_1_states_provinces_lakes.shp' using driver `ESRI Shapefile'
## Simple feature collection with 4593 features and 83 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -180 ymin: -90 xmax: 180 ymax: 83.6341
## CRS:            4326
small.lakes <- sf::st_read("../rawdata/spatial/glwd_2.shp") # small lakes
## Reading layer `glwd_2' from data source `/Users/jpither/OneDrive - The University Of British Columbia/work/students/faye/diatom/rawdata/spatial/glwd_2.shp' using driver `ESRI Shapefile'
## Simple feature collection with 244892 features and 7 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: -180 ymin: -55.58721 xmax: 180 ymax: 83.57595
## CRS:            4269
large.lakes <- sf::st_read("../rawdata/spatial/glwd_1.shp") # large lakes; missing CRS, so assign
## Reading layer `glwd_1' from data source `/Users/jpither/OneDrive - The University Of British Columbia/work/students/faye/diatom/rawdata/spatial/glwd_1.shp' using driver `ESRI Shapefile'
## Simple feature collection with 3721 features and 29 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -178.8711 ymin: -54.6059 xmax: 179.995 ymax: 81.98179
## CRS:            NA

Map wrangling

Large lakes dataset missing coordinate system definition, so assign using small lakes dataset:

st_crs(large.lakes) <- sf::st_crs(small.lakes)

Extract USA, Canada, and Mexico; eliminate extra fields; and put the borders in the same coordinate system as the lakes

# Keep the USA, Canada, and Mexico; drop unneeded attribute fields; and
# re-project the borders to match the lakes layers' CRS.
north.america <- dplyr::filter(borders,
                  admin %in% c("United States of America", "Canada", "Mexico"))
north.america <- north.america[, c(1:60, 84)]
north.america <- sf::st_transform(north.america, sf::st_crs(small.lakes))

Simplify borders to save space using the ms_simplify function from the rmapshaper package:

north.america <- rmapshaper::ms_simplify(north.america, keep = 0.1)

And exclude Hawaii and Aleutian islands:

north.america <- dplyr::filter(north.america, name != "Hawaii")
north.america <- sf::st_crop(north.america, xmin = -180, xmax = 0, ymin = -90, ymax = 90)
## although coordinates are longitude/latitude, st_intersection assumes that they are planar
## Warning: attribute variables are assumed to be spatially constant
## throughout all geometries

Re-project North America to Lambert Conformal Conic for creating an overview map:

north.america.lamb <- sf::st_transform(north.america, 
      crs = "+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80")

Make outer border map layer for overview map:

north.america.lamb.outer <- sf::st_union(north.america.lamb)

Now create map of just the focal states:

# Subset to the three focal states comprising the study region.
borders.usa <- dplyr::filter(north.america, admin == "United States of America")
study.area.us <- dplyr::filter(borders.usa,
          name %in% c("North Dakota", "South Dakota", "Nebraska"))

Create outer border layer of study region

study.area.outer.border <- sf::st_union(study.area.us)

Crop the lakes to study region:

# Keep only lakes lying entirely within the study-region outer border.
# Change: rowSums() replaces apply(temp, 1, function(x){sum(x)}) — the
# idiomatic, faster equivalent on the logical membership matrix returned
# by st_covered_by(sparse = F).
temp <- sf::st_covered_by(small.lakes, study.area.outer.border, sparse = F)
## although coordinates are longitude/latitude, st_covered_by assumes that they are planar
small.lakes.central <- small.lakes[rowSums(temp) > 0, ]

temp <- sf::st_covered_by(large.lakes, study.area.outer.border, sparse = F)
## although coordinates are longitude/latitude, st_covered_by assumes that they are planar
large.lakes.central <- large.lakes[rowSums(temp) > 0, ]

Combine all lakes together in one layer

# Combine small and large lakes into one layer, keeping only the attribute
# columns the two layers share.
# Change: the shared-column set is computed once instead of the same
# intersect() expression being repeated four times.
common.cols <- intersect(names(small.lakes.central), names(large.lakes.central))
study.area.lakes <- rbind(small.lakes.central[, common.cols],
                          large.lakes.central[, common.cols])
rm(temp) # remove temporary layer

Make coordinate system consistent among datasets, using a projection that is appropriate for distance calculations:

Put all datasets into north america equidistant conic (see https://spatialreference.org/ref/esri/102010/):

study.area.lakes.projected <- sf::st_transform(study.area.lakes, 102010)
study.area.us.projected <- sf::st_transform(study.area.us, 102010)
study.area.outer.border.projected <- sf::st_transform(study.area.outer.border, 102010)

Calculate distance between nearest-neighbour waters

Now get pairwise distances among all lakes in study area and its nearest neighbour lake, using the spatstat package function nndist:

# Nearest-neighbour distance (km) between lake centroids, computed with
# spatstat's nndist on the projected (equidistant conic, metres) centroids.
# Change: st_centroid gets its sf:: prefix per the script's stated convention.
temp <- sf::st_coordinates(sf::st_centroid(study.area.lakes.projected))
## Warning in st_centroid.sf(study.area.lakes.projected): st_centroid assumes
## attributes are constant over geometries of x
study.area.lakes.projected$min.nbr.distance.km <- spatstat::nndist(temp) / 1000

Plot histogram of nearest neighbour distances:

# Fig. 3C: histogram of nearest-neighbour distances between water bodies
# in the study region (10-km bins).
neighbour.dist.hist <- ggplot(data = study.area.lakes.projected, aes(x = min.nbr.distance.km)) + 
  theme_bw() + 
 # geom_density() +
  geom_histogram(col = "black", fill = "darkgrey", alpha = .1, breaks = seq(0, 100, by = 10)) +
  scale_x_continuous(breaks = seq(0, 100, by = 10)) +
  theme(panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
          panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
  labs(x = "Distance between nearest \nneighbour water bodies (km)", y = "Number of water bodies") +
  annotate("text", x = 100, y = 1050, label = "C") + # panel label for composite figure
    theme(axis.text.x = element_text(size = 7)) +
  theme(axis.text.y = element_text(size = 7)) +
  theme(axis.title.x = element_text(size = 9)) +
  theme(axis.title.y = element_text(size = 9))
neighbour.dist.hist
Fig. 3C in manuscript: Histogram of distances between surface water bodies and their nearest neighbours (N = 1252 water bodies)

Fig. 3C in manuscript: Histogram of distances between surface water bodies and their nearest neighbours (N = 1252 water bodies)


Calculating VPD from RH and temperature

Format date field. Make use of the lubridate package.

Note that the raw data provide times in UTC, so these need to be converted to Central Daylight Time (appropriate for the region and the months of April and May).

# Re-load the cached ASOS data and build date/hour fields in both UTC and
# US Central time (CDT during April-May).
# Change: lubridate:: prefixes added to ymd_hms() and with_tz() per the
# script's stated convention.
asos.data <- read.csv("../rawdata/ASOS_data_from_web.csv", header = T, row.names = 1)
asos.data$valid <- lubridate::ymd_hms(asos.data$valid, tz = "UTC")
asos.data$date <- base::as.Date(strptime(asos.data$valid,"%Y-%m-%d %H:%M"))
asos.data$hour.of.day <- lubridate::hour(asos.data$valid)
## change to Central Daylight Time (for April - May in the region)
## NOTE: the "with_tz" function knows to adjust Daylight Savings based on the date
asos.data$cdt.date <- lubridate::date(lubridate::with_tz(asos.data$valid, tz = "US/Central"))
asos.data$cdt.hour.of.day <- lubridate::hour(lubridate::with_tz(asos.data$valid, tz = "US/Central"))
# get month and day
#asos.data$month.day <- format(as.Date(asos.data$cdt.date), "%m-%d")

Join data to station info:

## Attach station metadata to each observation, keyed on the station
## identifier.  The factor/character coercion warning echoed below is
## expected and harmless.
asos.data <- dplyr::left_join(asos.data, asos.stations, by = c("station" = "id"))
## Warning: Column `station`/`id` joining factor and character vector,
## coercing into character vector

We will use three contrasting times of day for illustration: the early morning hours that tend to correspond to high flight activity and minimum daily VPD, mid-day, and evening.

First convert Fahrenheit to Celsius

# Convert the ASOS air temperature ("tmpf", Fahrenheit) to Celsius.
asos.data$temp_C <- (5/9)*(asos.data$tmpf - 32)

Next, filter to include morning (4am to 7am), mid-day (noon to 3pm), and dusk (8pm to 11pm) times:

## Keep only three windows of the day (hours in Central Daylight Time):
##   dawn 04:00-07:00, mid-day 12:00-15:00, dusk 20:00-23:00.
## The original expression relied on & binding tighter than |; the
## parentheses make that precedence explicit without changing the result.
asos.data.dawn.pm <- dplyr::filter(asos.data,
  (cdt.hour.of.day >= 4 & cdt.hour.of.day <= 7) |
  (cdt.hour.of.day >= 12 & cdt.hour.of.day <= 15) |
  (cdt.hour.of.day >= 20 & cdt.hour.of.day <= 23))
## Week of year, used for the weekly averaging below.
## NOTE(review): this uses the UTC-derived "date" column rather than
## "cdt.date", so near-midnight observations can land in the adjacent
## week -- confirm this is intended.
asos.data.dawn.pm$week <- lubridate::week(asos.data.dawn.pm$date)

Now let’s get basic descriptive stats for guiding further analyses.

How many stations per state?

## Number of distinct reporting stations per state, as a named list
## (length of a table of the station IDs present == number of unique IDs).
## sapply's return type depends on its input, so an explicitly named
## lapply is used instead; the printed result is unchanged.
state.ids <- unique(asos.data.dawn.pm$state)
stations.per.state <- lapply(
  stats::setNames(state.ids, state.ids),
  function(x) {
    length(table(asos.data.dawn.pm[asos.data.dawn.pm$state == x, "station"]))
  })
stations.per.state
## $ND
## [1] 37
## 
## $SD
## [1] 21
## 
## $NE
## [1] 40

Now calculate vapour pressure deficit values based on RH and temperature, again using the bigleaf R package

## Vapour pressure deficit from relative humidity and air temperature.
## relh is reported in percent, so divide by 100 to get the fraction
## that rH.to.VPD expects.
asos.data.dawn.pm$vpd <- bigleaf::rH.to.VPD(asos.data.dawn.pm$relh/100, asos.data.dawn.pm$temp_C)

Create grouping variable for different times of day

## Label each observation with its time-of-day window as a factor with
## levels ordered Dawn, Mid-day, Dusk.  cut() on the integer hour is
## equivalent to the nested-ifelse form: hour < 10 -> Dawn,
## 10 <= hour <= 16 -> Mid-day, hour > 16 -> Dusk.
asos.data.dawn.pm$time_of_day_group <- cut(asos.data.dawn.pm$cdt.hour.of.day,
                                           breaks = c(-Inf, 9.5, 16.5, Inf),
                                           labels = c("Dawn", "Mid-day", "Dusk"))

Calculate average RH and temperature grouped by weather station, time of day, and week:

## Weekly means of RH, temperature, and VPD per weather station and
## time-of-day window.  The three sampsize.* columns are identical by
## construction (each is n() for the same group) but are kept so the
## downstream column layout is unchanged.
asos.data.dawn.pm.mean.station.week <- asos.data.dawn.pm %>% 
  dplyr::group_by(station, time_of_day_group, week) %>% 
  dplyr::summarise(mean.rh = mean(relh, na.rm = T),
                   sampsize.rh = n(),
                   mean.tempc = mean(temp_C, na.rm = T),
                   sampsize.tempc = n(),
                   mean.vpd = mean(vpd, na.rm = T),
                   sampsize.vpd = n())

## The same summaries aggregated at the state level; used by the
## time-series plots below.
asos.data.dawn.pm.mean.state.week <- asos.data.dawn.pm %>% 
  dplyr::group_by(state, time_of_day_group, week) %>% 
  dplyr::summarise(mean.rh = mean(relh, na.rm = T),
                   sampsize.rh = n(),
                   mean.tempc = mean(temp_C, na.rm = T),
                   sampsize.tempc = n(),
                   mean.vpd = mean(vpd, na.rm = T),
                   sampsize.vpd = n())

And, let’s get basic descriptive stats on the RH, temperature, and VPD values from April 1 to May 31, from 2015 to 2020:

## Descriptive statistics (mean, median, quartiles, N) for RH,
## temperature, and VPD by time-of-day window.  Pivoting to long format
## yields one row per window x variable combination.
asos.data.descriptive.stats <- asos.data.dawn.pm %>%
dplyr::select(time_of_day_group, relh, temp_C, vpd) %>%
tidyr::pivot_longer(cols = c("relh", "temp_C", "vpd"), names_to = "variable", values_to = "value") %>%
  dplyr::group_by(time_of_day_group, variable) %>% 
  dplyr::summarise(mean = mean(value, na.rm = TRUE),
                   median = median(value, na.rm = TRUE),
                   # request the quartiles directly rather than indexing
                   # positions [2] and [4] of the default five-number
                   # quantile output; the values are identical
                   q1 = quantile(value, probs = 0.25, na.rm = TRUE),
                   q3 = quantile(value, probs = 0.75, na.rm = TRUE),
                   sampsize = n())

Now view the output. Show the table output:

# render the summary table with kable/kableExtra (nothing is transposed
# here; the earlier "transpose table" comment was stale)
options(knitr.kable.NA = '') # suppress showing NA values in table
kable(asos.data.descriptive.stats, booktabs = T, digits = 4,
      caption = "Summary statistics for weather variables for April and May, years 2015 to 2020 inclusive", 
      align = "rrrrr") %>%
  kable_styling(latex_options = c("striped", "hold_position"))
Summary statistics for weather variables for April and May, years 2015 to 2020 inclusive
time_of_day_group variable mean median q1 q3 sampsize
Dawn relh 83.6388 86.5900 74.9300 95.1300 276392
Dawn temp_C 5.6795 6.0000 1.1111 10.5000 276392
Dawn vpd 0.1702 0.1106 0.0392 0.2434 276392
Mid-day relh 52.8752 48.0000 32.8300 71.2800 272104
Mid-day temp_C 14.3294 15.0000 9.0000 20.2778 272104
Mid-day vpd 0.9688 0.8404 0.3174 1.4489 272104
Dusk relh 68.3261 69.0400 52.0500 86.1300 270493
Dusk temp_C 9.7199 10.0000 5.0000 15.0000 270493
Dusk vpd 0.4665 0.3489 0.1290 0.6901 270493

So, the first and third quartiles for the VPD data are useful reference points for exploration below (along with the timeplot below).

Create time series plot of air temperature.

## Fig. S7A: weekly mean air temperature by state, faceted by
## time-of-day window (dawn / mid-day / dusk).
labels.facet <- c("Dawn (04:00 to 07:00)", "Mid-day (12:00 - 15:00)", "Dusk (20:00 to 23:00)")
names(labels.facet) <- c("Dawn", "Mid-day", "Dusk")
Fig_S7.temp.timeplot <- ggplot(data = asos.data.dawn.pm.mean.state.week,
                               aes(x = week, y = mean.tempc, col = state)) +
  geom_line() +
  theme_bw() +
  guides(col = guide_legend(title = "State")) +
  scale_x_continuous(breaks = 13:22) +
  labs(tag = "A",
       x = "Weeks from January 1",
       y = "Average air temperature (degrees Celsius)") +
  # one consolidated theme() call replaces the repeated single-setting calls
  theme(panel.grid.major.y = element_line(colour = "lightgrey", size = 0.25),
        panel.grid.major.x = element_line(colour = "lightgrey", size = 0.25),
        panel.grid.minor.x = element_blank(),
        axis.text.x = element_text(size = 7),
        axis.text.y = element_text(size = 7),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9)) +
  facet_grid(. ~ time_of_day_group,
             labeller = labeller(time_of_day_group = labels.facet))

Fig_S7.temp.timeplot
Air temperature (degrees Celsius) for three times of day, averaged from April 1 to May 31 over the years 2015 to 2020 inclusive. Week 13 corresponds to first week of April.

Air temperature (degrees Celsius) for three times of day, averaged from April 1 to May 31 over the years 2015 to 2020 inclusive. Week 13 corresponds to first week of April.

Create time series plot of relative humidity.

## Fig. S7B: weekly mean relative humidity by state, faceted by
## time-of-day window (same layout and styling as the temperature plot).
labels.facet <- c("Dawn (04:00 to 07:00)", "Mid-day (12:00 - 15:00)", "Dusk (20:00 to 23:00)")
names(labels.facet) <- c("Dawn", "Mid-day", "Dusk")
Fig_S7.rh.timeplot <- ggplot(data = asos.data.dawn.pm.mean.state.week, 
               aes(x = week, y = mean.rh, col = state)) +
  guides(col = guide_legend(title = "State")) +
  geom_line() +
  theme_bw() +
  labs(tag = "B") +
    theme(panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
          panel.grid.major.x = element_line(colour = "lightgrey", size = (0.25)),
        panel.grid.minor.x = element_blank()) +
  #  ylim(0, 1.5) +
  theme(axis.text.x = element_text(size = 7)) +
  theme(axis.text.y = element_text(size = 7)) +
  theme(axis.title.x = element_text(size = 9)) +
  theme(axis.title.y = element_text(size = 9)) +
  scale_x_continuous(breaks = 13:22) +
    labs(x = "Weeks from January 1", y = "Average relative humidity (%)") 
Fig_S7.rh.timeplot <- Fig_S7.rh.timeplot + facet_grid(. ~ time_of_day_group, labeller = labeller(time_of_day_group = labels.facet)) 

Fig_S7.rh.timeplot
Relative humidity (%) at three different times of day, averaged from April 1 to May 31 over the years 2015 to 2020 inclusive. Week 13 corresponds to first week of April.

Relative humidity (%) at three different times of day, averaged from April 1 to May 31 over the years 2015 to 2020 inclusive. Week 13 corresponds to first week of April.

Create time series plot of vapour pressure deficit.

## Fig. S7C: weekly mean vapour pressure deficit by state, faceted by
## time-of-day window.  Here the y axis IS fixed to 0-1.5 kPa (the ylim
## commented out in the other two panels), putting all facets on one scale.
labels.facet <- c("Dawn (04:00 to 07:00)", "Mid-day (12:00 - 15:00)", "Dusk (20:00 to 23:00)")
names(labels.facet) <- c("Dawn", "Mid-day", "Dusk")
Fig_S7.vpa.timeplot <- ggplot(data = asos.data.dawn.pm.mean.state.week, 
               aes(x = week, y = mean.vpd, col = state)) +
  guides(col = guide_legend(title = "State")) +
  geom_line() +
  theme_bw() +
  labs(tag = "C") +
    theme(panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
          panel.grid.major.x = element_line(colour = "lightgrey", size = (0.25)),
        panel.grid.minor.x = element_blank()) +
    ylim(0, 1.5) +
  theme(axis.text.x = element_text(size = 7)) +
  theme(axis.text.y = element_text(size = 7)) +
  theme(axis.title.x = element_text(size = 9)) +
  theme(axis.title.y = element_text(size = 9)) +
  scale_x_continuous(breaks = 13:22) +
    labs(x = "Weeks from January 1", y = "Average vapour pressure deficit (kPa)") 
Fig_S7.vpa.timeplot <- Fig_S7.vpa.timeplot + facet_grid(. ~ time_of_day_group, labeller = labeller(time_of_day_group = labels.facet)) 

Fig_S7.vpa.timeplot
Vapour pressure deficit for three times of day, averaged from April 1 to May 31 over the years 2015 to 2020 inclusive. Week 13 corresponds to first week of April.

Vapour pressure deficit for three times of day, averaged from April 1 to May 31 over the years 2015 to 2020 inclusive. Week 13 corresponds to first week of April.

Clearly a huge difference between mid-day and the other times


Describe mallard flight data

Calculate median distance travelled by mallards in North America, both overall and for trips that lasted less than a day

## Convert the reported flight distances (km) to metres, then take the
## median over all trips, and over trips completed within a single day
## (Time.elapsed..days. == 0).
mallard.data$dist_m <- mallard.data$Distance..Km. * 1000 #convert to m
viana_median_dist_m <- median(mallard.data$dist_m)
viana_median_dist_m.within.day <- median(mallard.data$dist_m[mallard.data$Time.elapsed..days. == 0])

Get just distances travelled within a day:

## Keep only trips whose elapsed time was less than one day.
mallard.data.within.day <- dplyr::filter(mallard.data, Time.elapsed..days. == 0)

Plot histogram of daily distance:

## Fig. 3B: histogram of within-day mallard flight distances.
## Bins are 100 km wide spanning 0-600 km; the "B" panel label sits at
## the top right of the plot area.
flight.km.breaks <- seq(0, 600, by = 100)
flight.dist.hist <- ggplot(data = mallard.data.within.day,
                           aes(x = Distance..Km.)) +
  theme_bw() +
  geom_histogram(col = "black", fill = "darkgrey", alpha = .1,
                 breaks = flight.km.breaks) +
  scale_x_continuous(breaks = flight.km.breaks) +
  annotate("text", x = 600, y = 24, label = "B") +
  labs(x = "Distance travelled (km)", y = "Number of flights") +
  # one consolidated theme() call replaces the repeated single-setting calls
  theme(panel.grid.major.y = element_line(colour = "lightgrey", size = 0.25),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.text.x = element_text(size = 7),
        axis.text.y = element_text(size = 7),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9))
flight.dist.hist
Fig. 3B in manuscript: Histogram of distances travelled by mallards in North America in a single day (N = 33)

Fig. 3B in manuscript: Histogram of distances travelled by mallards in North America in a single day (N = 33)


Diatom data

Import diatom data, using 2007 and 2012 NLA data. The latter year only provides occurrences of the genus Nitzschia, whereas the former year provides both the genus occurrences and those of species N. pusilla.

## Read the NLA phytoplankton count and lake-information tables for the
## 2007 and 2012 surveys; files are read as latin1-encoded text.
## header/strip.white/stringsAsFactors spelled out as TRUE/FALSE rather
## than the reassignable shorthands T/F; a stray trailing semicolon on
## the second call has been removed.
nla.2007.phyto.counts <- read.csv("../rawdata/diatom_lake_data/nla2007_phytoplankton_diatomcount_20091125.txt", 
                            header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE, encoding = "latin1")
nla.2007.lake.data <- read.csv("../rawdata/diatom_lake_data/nla2007_sampledlakeinformation_20091113.txt", 
                            header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE, encoding = "latin1")
nla.2012.lake.data <- read.csv("../rawdata/diatom_lake_data/nla2012_wide_siteinfo_08232016.txt", 
                            header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE, encoding = "latin1")
nla.2012.phyto.counts <- read.csv("../rawdata/diatom_lake_data/nla2012_wide_phytoplankton_count_02122014.txt", 
                            header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE, encoding = "latin1")

Nitzschia occurrence data

Subset to first visit samples (some lakes had multiple visits), and then make pres/abs table

## Keep only each lake's first visit, then cross-tabulate SITE_ID by
## taxon name to get a presence/absence (count) table.
## dplyr:: prefix added for consistency with the script's stated
## convention of namespacing all non-base calls.
temp <- nla.2007.phyto.counts %>% 
  dplyr::filter(VISIT_NO == 1)
nla.2007.phyto.presabs <- table(temp$SITE_ID, temp$TAXANAME)

Identify lakes where the genus Nitzschia occurs, then where Nitzschia pusilla Grunow occurs

# genus
Nitzschia.2007.pres <- grep("Nitzschia", dimnames(nla.2007.phyto.presabs)[[2]])
# species
Nitzschia.pusilla.2007.pres <- grep("Nitzschia pusilla Grunow", 
              dimnames(nla.2007.phyto.presabs)[[2]])

Get lake names

## Lakes with any Nitzschia occurrence: subset the pres/abs table to the
## Nitzschia columns and keep the row names (SITE_IDs) of rows with a
## non-zero row sum; then pull the matching lake-information rows.
Nitzschia.2007.lakes <- row.names(nla.2007.phyto.presabs[,Nitzschia.2007.pres][rowSums(nla.2007.phyto.presabs[,Nitzschia.2007.pres]) > 0,])
Nitzschia.2007.lake.info <- nla.2007.lake.data[match(Nitzschia.2007.lakes, nla.2007.lake.data$SITE_ID),]
## For N. pusilla the column subset is treated as a vector (presumably a
## single matching column -- TODO confirm); keep the names (SITE_IDs) of
## its positive entries.
Nitzschia.pusilla.2007.lakes <- names(nla.2007.phyto.presabs[,Nitzschia.pusilla.2007.pres][nla.2007.phyto.presabs[,Nitzschia.pusilla.2007.pres] > 0])
Nitzschia.pusilla.2007.lake.info <- nla.2007.lake.data[match(Nitzschia.pusilla.2007.lakes, nla.2007.lake.data$SITE_ID),]

Now the 2012 data. Subset to first-visit samples, and then make the pres/abs table:

## Keep only each lake's first visit, then cross-tabulate SITE_ID by
## taxon to get the 2012 presence/absence (count) table.
## dplyr:: prefix added for consistency with the script's stated
## convention of namespacing all non-base calls.
temp <- nla.2012.phyto.counts %>% 
  dplyr::filter(VISIT_NO == 1)
nla.2012.phyto.presabs <- table(temp$SITE_ID, temp$TARGET_TAXON)

Now for the 2012 data, identify lakes where the genus Nitzschia occurs (the 2012 data provide genus-level occurrences only)

# genus
# the 2012 taxon labels are upper-case, hence the different pattern
Nitzschia.2012.pres <- grep("NITZSCHIA", dimnames(nla.2012.phyto.presabs)[[2]])

Get lake names

## Lakes (SITE_IDs) with any NITZSCHIA occurrence in 2012 (non-zero row
## sum over the Nitzschia columns), plus their site-information rows.
Nitzschia.2012.lakes <- row.names(nla.2012.phyto.presabs[,Nitzschia.2012.pres][rowSums(nla.2012.phyto.presabs[,Nitzschia.2012.pres]) > 0,])
Nitzschia.2012.lake.info <- nla.2012.lake.data[match(Nitzschia.2012.lakes, nla.2012.lake.data$SITE_ID),]

Now export to file for backup.

## Back up the occurrence IDs, dates, and coordinates to CSV.
## NOTE(review): "pusella" in the species file name looks like a typo for
## "pusilla"; kept as-is because published outputs may reference this path.
write.csv(Nitzschia.2007.lake.info[,c("SITE_ID", "DATE_COL", "LON_DD", "LAT_DD")], 
          "../output/nitzschia_2007_occurrences_NLA.csv")
write.csv(Nitzschia.pusilla.2007.lake.info[,c("SITE_ID", "DATE_COL", "LON_DD", "LAT_DD")], "../output/nitzschia_pusella_2007_occurrences_NLA.csv")
write.csv(Nitzschia.2012.lake.info[,c("SITE_ID", "DATE_COL", "LAT_DD83", "LON_DD83")], 
          "../output/nitzschia_2012_occurrences_NLA.csv")

Now, need to project lat/long for mapping, and the data frame to SF format, see this tutorial here https://datacarpentry.org/r-raster-vector-geospatial/10-vector-csv-to-shapefile-in-r/index.html.

We will assume the Lat/Long information in the CSV file is NAD83.

## All three occurrence tables carry NAD83 geographic coordinates; build
## an sf point layer from each with one shared CRS definition.  A small
## helper removes the three near-identical st_as_sf calls.
nad83.longlat.crs <- "+proj=longlat +ellps=GRS80 +datum=NAD83 +no_defs"
# helper: sf point layer from a lake-info data frame, given the names of
# its longitude and latitude columns (2012 uses *_DD83 column names)
as.nad83.sf <- function(lake.info, lon.col, lat.col) {
  sf::st_as_sf(lake.info[, c("SITE_ID", "DATE_COL", lon.col, lat.col)],
               coords = c(lon.col, lat.col),
               crs = nad83.longlat.crs)
}
Nitzschia.locations.2007.sf <- as.nad83.sf(Nitzschia.2007.lake.info, "LON_DD", "LAT_DD")
diatom.locations.2012.sf <- as.nad83.sf(Nitzschia.2012.lake.info, "LON_DD83", "LAT_DD83")
pusilla.locations.2007.sf <- as.nad83.sf(Nitzschia.pusilla.2007.lake.info, "LON_DD", "LAT_DD")

Diatom data projection, then intersect with study area

## Re-project all three point layers into the CRS of the projected
## study-area polygon so they can be intersected with it below.
Nitzschia.locations.2007.projected <- sf::st_transform(Nitzschia.locations.2007.sf, 
                                        sf::st_crs(study.area.us.projected))
Nitzschia.locations.2012.projected <- sf::st_transform(diatom.locations.2012.sf, 
                                        sf::st_crs(study.area.us.projected))
pusilla.locations.2007.projected <- sf::st_transform(pusilla.locations.2007.sf, 
                                        sf::st_crs(study.area.us.projected))

Clip to study region

## Keep only points that fall inside the study-area polygon.
## st_intersects() (sparse, the default) returns, for each point, the
## indices of the polygons it intersects; a point is inside iff that
## index list is non-empty.  lengths() on the sparse result is the
## canonical test and avoids densifying the predicate the way
## apply(..., 1, sum) does.
Nitzschia.temp <- sf::st_intersects(Nitzschia.locations.2007.projected, study.area.us.projected)
Nitzschia.locations.2007.cropped <- Nitzschia.locations.2007.projected[lengths(Nitzschia.temp) > 0,]
Nitzschia.temp <- sf::st_intersects(Nitzschia.locations.2012.projected, study.area.us.projected)
Nitzschia.locations.2012.cropped <- Nitzschia.locations.2012.projected[lengths(Nitzschia.temp) > 0,]
pusilla.temp <- sf::st_intersects(pusilla.locations.2007.projected, study.area.us.projected)
pusilla.locations.2007.cropped <- pusilla.locations.2007.projected[lengths(pusilla.temp) > 0,]

Combine the Nitzschia genus locations from 2007 and 2012 NLA data:

## Pool the cropped 2007 and 2012 genus-level occurrence points into one
## layer (both carry the same SITE_ID / DATE_COL / geometry columns).
Nitzschia.locations <- rbind(Nitzschia.locations.2007.cropped, 
                             Nitzschia.locations.2012.cropped)

Spatial predictions

For spatial predictions, we need the following raster layers:

  • vapour pressure deficit layer
  • a flight time raster layer, with pixels representing flight time from source lakes (which will be the N. pusilla lakes)

We create the flight time layer using a distance layer.

Create VPD layer

For spatial interpolation of VPD across the region, we use beginning of May VPD data, which generally corresponds to week 18. Then get mean VPD by station for the dawn, mid-day, and dusk periods.

## Week-18 (early May) mean VPD per station and time-of-day window,
## joined back to the station metadata for coordinates.
asos.data.week18.vpd <- asos.data.dawn.pm %>% dplyr::filter(week == 18) %>% dplyr::group_by(station, time_of_day_group) %>%
  dplyr::summarise(mean.vpd = mean(vpd, na.rm = T))
asos.data.week18.vpd <- dplyr::left_join(asos.data.week18.vpd, asos.stations, by = c("station" = "id"))

Create geospatial SF object from VPD data, using appropriate CRS.

The resulting object contains the stations for all three time-of-day groups:

## Station means to sf points (lon/lat, same geographic CRS as the
## small.lakes layer), then transform to the projected CRS with
## (ESRI) code 102010.
asos.data.week18.vpd.sf <- sf::st_as_sf(asos.data.week18.vpd, coords = c("lon", "lat"), 
                                crs = sf::st_crs(small.lakes))
asos.projected.week18.vpd <- sf::st_transform(asos.data.week18.vpd.sf, 102010)

Now create a raster grid on which to predict VPD using inverse distance weighted interpolation.

We’ll use a simple inverse distance weighted interpolation model: 3 nearest neighbors and a 2nd order polynomial. See the following online resources: https://rspatial.org/raster/analysis/4-interpolation.html#nearest-neighbour-interpolation.

This makes use of the sf, sp, gstat, and raster packages.

First create sample grid:

## Build a regular ~50,000-point sample grid over the extent of the
## station points, promote it to a SpatialGrid, and convert to the
## raster onto which the IDW surfaces are predicted.
grd.vpd  <- as.data.frame(sp::spsample(sf::as_Spatial(asos.projected.week18.vpd), "regular", n = 50000))
names(grd.vpd)  <- c("X", "Y")
sp::coordinates(grd.vpd) <- c("X", "Y")
sp::gridded(grd.vpd) <- TRUE  # Create SpatialPixel object
sp::fullgrid(grd.vpd) <- TRUE  # Create SpatialGrid object
## give the grid the station layer's projection (namespaced sp:: / sf::
## calls for consistency with the rest of the script)
sp::proj4string(grd.vpd) <- sp::proj4string(sf::as_Spatial(asos.projected.week18.vpd))
grd.vpd.raster <- raster::raster(grd.vpd)

Build spatial models:

## IDW interpolation models (3 nearest stations, inverse-distance power
## 2) of mean VPD, one per time-of-day window.  A local helper replaces
## three copies of the identical gstat setup.
make.vpd.idw <- function(tod) {
  # stations belonging to the requested time-of-day window
  tod.stations <- dplyr::filter(asos.projected.week18.vpd, time_of_day_group == tod)
  gstat::gstat(formula = mean.vpd ~ 1, data = tod.stations,
               locations = as_Spatial(tod.stations),
               nmax = 3, set = list(idp = 2))
}
gs.dawn.vpd <- make.vpd.idw("Dawn")
gs.mid.vpd <- make.vpd.idw("Mid-day")
gs.dusk.vpd <- make.vpd.idw("Dusk")

Interpolate dawn, mid-day, and dusk values, and create raster stack, with band1 = Dawn VPD, and band2 = mid-day and band3 = dusk VPD

## Interpolate each model onto the prediction grid and stack the results:
## band 1 = dawn, band 2 = mid-day, band 3 = dusk (layer names
## var1.pred.1 .. var1.pred.3, used below).
gs.vpd.stack <- raster::stack(raster::interpolate(grd.vpd.raster, gs.dawn.vpd), raster::interpolate(grd.vpd.raster, gs.mid.vpd), raster::interpolate(grd.vpd.raster, gs.dusk.vpd))
## [inverse distance weighted interpolation]
## [inverse distance weighted interpolation]
## [inverse distance weighted interpolation]
## carry the station layer's CRS over to the stack
proj4string(gs.vpd.stack) <- sp::proj4string(as_Spatial(asos.projected.week18.vpd))

Clip to study region

## Mask the interpolated surfaces to the study-region outline; cells
## outside the border become NA.
gs.vpd.stack.clip <- raster::mask(gs.vpd.stack, sf::as_Spatial(study.area.outer.border.projected))

Create raster dataframe using ggspatial package (required to map with ggplot)

## Flatten each band to an x/y/band1 data frame so it can be drawn with
## ggplot's geom_raster/geom_tile.
dawn.vpd.raster.dataframe <- ggspatial::df_spatial(gs.vpd.stack.clip$var1.pred.1)
mid.vpd.raster.dataframe <- ggspatial::df_spatial(gs.vpd.stack.clip$var1.pred.2)
dusk.vpd.raster.dataframe <- ggspatial::df_spatial(gs.vpd.stack.clip$var1.pred.3)

Gather descriptive stats

## Five-number summaries (plus mean) of the interpolated VPD for each
## window; NAs (cells masked out above) are dropped first.
summary(na.omit(dawn.vpd.raster.dataframe$band1))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.07439 0.17777 0.20781 0.20675 0.23511 0.40769
summary(na.omit(mid.vpd.raster.dataframe$band1))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6976  1.1471  1.2500  1.2348  1.3320  1.5227
summary(na.omit(dusk.vpd.raster.dataframe$band1))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3080  0.5155  0.5642  0.5520  0.5998  0.7776

Visualize the data:

## Dawn VPD surface with the study-area outline and the dawn-window ASOS
## stations overlaid.  Colour limits (0.05-1.75 kPa) are shared by all
## three VPD maps so they are directly comparable; the legend is placed
## above the panel area.
## NOTE(review): the caption below also mentions diatom occurrence
## symbols, which are not plotted in this code -- confirm the caption.
VPD.dawn.map <- ggplot() + 
geom_raster(data = dawn.vpd.raster.dataframe, aes(x = x, y = y, fill = band1)) +
  # make no data values transparent
scale_fill_gradient("VPD (kPa)",
                      low = 'yellow', high = 'blue', limits = c(0.05, 1.75),
                      na.value = NA, guide = guide_colorbar(ticks.colour = "black", 
                              direction = "horizontal", title.position = "top", 
                        ticks.linewidth = 1.5, 
                        title.hjust = .5, frame.colour = "black")) +
      labs(title = "Dawn (04:00 - 07:00)") +
      theme(plot.title = element_text(size = 7.5, face = "bold", hjust = 0.5, vjust = -6)) +
#    theme(plot.tag.position = c(0.17, 0.95), plot.title.position = "panel") +
  theme(legend.position = c(0.5, 1.1), 
        legend.title = element_text(size = 7.5, face = "bold"), 
        legend.text = element_text(size = 7),
        legend.key.height = unit(0.4, "cm"), legend.key.width = unit(0.95, "cm")) +
   geom_sf(data = study.area.us.projected, col = "grey", lwd = 1.2, fill = NA) +
  geom_sf(data = dplyr::filter(asos.projected.week18.vpd, time_of_day_group == "Dawn"), col = "black", lwd = 1) +
  labs(x = "Longitude", y = "Latitude") +
   theme(axis.text.x = element_text(size = 7)) +
  theme(axis.text.y = element_text(size = 7)) +
  theme(axis.title.x = element_text(size = 9)) +
  theme(axis.title.y = element_text(size = 9)) +
  theme(plot.margin =margin(1, 1, 1, 1, "pt"))
VPD.dawn.map
Near-dawn (4-7am) vapour pressure deficit (VPD), averaged over the first week of May for the years 2015 through 2020 inclusive. Green points are *N. pusilla* locations and plus symbols are *Nitzschia* genus locations. Black dots are ASOS stations.

Near-dawn (4-7am) vapour pressure deficit (VPD), averaged over the first week of May for the years 2015 through 2020 inclusive. Green points are N. pusilla locations and plus symbols are Nitzschia genus locations. Black dots are ASOS stations.

Now mid-day:

## Mid-day VPD surface, styled identically to the dawn map (shared
## 0.05-1.75 kPa colour limits).  The y-axis text and title are blanked,
## presumably because this panel shares the y axis when the three maps
## are assembled into one figure at the bottom of the script.
VPD.mid.map <- ggplot() + 
geom_raster(data = mid.vpd.raster.dataframe, aes(x = x, y = y, fill = band1)) +
  # make no data values transparent
scale_fill_gradient("VPD (kPa)",
                      low = 'yellow', high = 'blue', limits = c(0.05, 1.75),
                      na.value = NA, guide = guide_colorbar(ticks.colour = "black", 
                              direction = "horizontal", title.position = "top", 
                        ticks.linewidth = 1.5, 
                        title.hjust = .5, frame.colour = "black")) +
      labs(title = "Mid-day (12:00 - 15:00)") +
      theme(plot.title = element_text(size = 7.5, face = "bold", hjust = 0.5, vjust = -6)) +
#    theme(plot.tag.position = c(0.17, 0.95), plot.title.position = "panel") +
  theme(legend.position = c(0.5, 1.1), 
        legend.title = element_text(size = 7.5, face = "bold"), 
        legend.text = element_text(size = 7),
        legend.key.height = unit(0.4, "cm"), legend.key.width = unit(0.95, "cm")) +
   geom_sf(data = study.area.us.projected, col = "grey", lwd = 1.2, fill = NA) +
  geom_sf(data = dplyr::filter(asos.projected.week18.vpd, time_of_day_group == "Mid-day"), col = "black", lwd = 1) +
  labs(x = "Longitude", y = "Latitude") +
   theme(axis.text.x = element_text(size = 7)) +
  theme(axis.text.y = element_blank()) +
  theme(axis.title.x = element_text(size = 9)) +
  theme(axis.title.y = element_blank()) +
  theme(plot.margin =margin(1, 1, 1, 1, "pt"))
VPD.mid.map
Mid-day (noon-3pm) vapour pressure deficit (VPD), averaged over the first week of May for the years 2015 through 2020 inclusive. Green points are *N. pusilla* locations and plus symbols are *Nitzschia* genus locations. Black dots are ASOS stations.

Mid-day (noon-3pm) vapour pressure deficit (VPD), averaged over the first week of May for the years 2015 through 2020 inclusive. Green points are N. pusilla locations and plus symbols are Nitzschia genus locations. Black dots are ASOS stations.

And dusk:

## Dusk VPD surface, styled identically to the dawn and mid-day maps
## (shared 0.05-1.75 kPa colour limits; y-axis text/title blanked as in
## the mid-day panel).
VPD.dusk.map <- ggplot() + 
geom_raster(data = dusk.vpd.raster.dataframe, aes(x = x, y = y, fill = band1)) +
  # make no data values transparent
scale_fill_gradient("VPD (kPa)",
                      low = 'yellow', high = 'blue', limits = c(0.05, 1.75),
                      na.value = NA, guide = guide_colorbar(ticks.colour = "black", 
                              direction = "horizontal", title.position = "top", 
                        ticks.linewidth = 1.5, 
                        title.hjust = .5, frame.colour = "black")) +
      labs(title = "Dusk (20:00 - 23:00)") +
      theme(plot.title = element_text(size = 7.5, face = "bold", hjust = 0.5, vjust = -6)) +
#    theme(plot.tag.position = c(0.17, 0.95), plot.title.position = "panel") +
  theme(legend.position = c(0.5, 1.1), 
        legend.title = element_text(size = 7.5, face = "bold"), 
        legend.text = element_text(size = 7),
        legend.key.height = unit(0.4, "cm"), legend.key.width = unit(0.95, "cm")) +
   geom_sf(data = study.area.us.projected, col = "grey", lwd = 1.2, fill = NA) +
  geom_sf(data = dplyr::filter(asos.projected.week18.vpd, time_of_day_group == "Dusk"), col = "black", lwd = 1) +
  labs(x = "Longitude", y = "Latitude") +
   theme(axis.text.x = element_text(size = 7)) +
  theme(axis.text.y = element_blank()) +
  theme(axis.title.x = element_text(size = 9)) +
  theme(axis.title.y = element_blank()) +
  theme(plot.margin =margin(1, 1, 1, 1, "pt"))
VPD.dusk.map
Dusk (8-11pm) vapour pressure deficit (VPD), averaged over the first week of May for the years 2015 through 2020 inclusive. Green points are *N. pusilla* locations and plus symbols are *Nitzschia* genus locations. Black dots are ASOS stations.

Dusk (8-11pm) vapour pressure deficit (VPD), averaged over the first week of May for the years 2015 through 2020 inclusive. Green points are N. pusilla locations and plus symbols are Nitzschia genus locations. Black dots are ASOS stations.


Create flight time layer

We’ll use the distanceFromPoints raster function to calculate distance from all points of interest

First, exclude any lakes that are outside of the bounding box of the raster:

## Drop any occurrence points lying west of the raster's bounding box so
## distanceFromPoints() below only receives points over the grid.
## (The comparison is on the projected x-coordinate only.)
## sf:: prefix added to st_coordinates for consistency; the stale
## "# first use relh.clip" note has been replaced by this comment.
pusilla.locations.2007.cropped.to.raster <- pusilla.locations.2007.cropped[
  sf::st_coordinates(pusilla.locations.2007.cropped)[,1] > sp::bbox(gs.vpd.stack.clip)[1,1],]
Nitzschia.locations.cropped.to.raster <- Nitzschia.locations[
  sf::st_coordinates(Nitzschia.locations)[,1] > sp::bbox(gs.vpd.stack.clip)[1,1],]

Use the N. pusilla occurrence lakes as source lakes.

## Distance (in map units, metres) from every raster cell to the nearest
## N. pusilla source lake, masked to the study region.
## sf:: prefix added to as_Spatial for consistency with the rest of the
## script.
pusilla.raster <- raster::distanceFromPoints(gs.vpd.stack.clip, sf::as_Spatial(
  pusilla.locations.2007.cropped.to.raster))
pusilla.raster <- raster::mask(pusilla.raster, sf::as_Spatial(study.area.outer.border.projected))

Convert to km

## Convert the distance-to-source raster from metres to kilometres.
pusilla.raster.km <- pusilla.raster/1000

So now we can use this raster to calculate a new raster representing the time (in minutes) from each source lake, based on flight speed of 69 km/h.

## Mallard flight speed (km/h) used to convert distance to flight time;
## named so the assumption is visible and easy to change.
mallard.flight.speed.kmh <- 69
## flight time in minutes = (distance km / speed km/h) * 60 min/h
pusilla.raster.flight.time <- (pusilla.raster.km / mallard.flight.speed.kmh) * 60

Create potential dispersal raster

Now with the flight time and the VPD raster layers, we can use the predict raster function to predict probability of potential dispersal.

First create a raster stack with the two variables (layers)

## Stack each time-of-day VPD band with the flight-time layer.  The layer
## names must match the predictor names expected by the GLM used below,
## so one shared name vector is applied to all three stacks.
flight.stack.layer.names <- c("vpd", "time_min_num")
vpd.dawn.flight.stack <- raster::stack(gs.vpd.stack.clip$var1.pred.1, pusilla.raster.flight.time)
vpd.mid.flight.stack  <- raster::stack(gs.vpd.stack.clip$var1.pred.2, pusilla.raster.flight.time)
vpd.dusk.flight.stack <- raster::stack(gs.vpd.stack.clip$var1.pred.3, pusilla.raster.flight.time)
names(vpd.dawn.flight.stack) <- flight.stack.layer.names
names(vpd.mid.flight.stack)  <- flight.stack.layer.names
names(vpd.dusk.flight.stack) <- flight.stack.layer.names

Now make a probability of potential dispersal layer based on the GLM:

## Per-cell probability of potential dispersal from the fitted logistic
## GLM (glm.full.vpd, defined earlier in the full script), using the
## "vpd" and "time_min_num" layers as predictors; type = "response"
## returns probabilities rather than log-odds.
dispersal.dawn.prob.raster <- raster::predict(vpd.dawn.flight.stack, glm.full.vpd, type = "response")
dispersal.mid.prob.raster <- raster::predict(vpd.mid.flight.stack, glm.full.vpd, type = "response")
dispersal.dusk.prob.raster <- raster::predict(vpd.dusk.flight.stack, glm.full.vpd, type = "response")

Convert the raster to a dataframe using ggspatial function df_spatial

## Convert each probability raster to an x/y/band1 data frame for ggplot.
dispersal.dawn.prob.dataframe <- ggspatial::df_spatial(dispersal.dawn.prob.raster)
dispersal.mid.prob.dataframe <- ggspatial::df_spatial(dispersal.mid.prob.raster)
dispersal.dusk.prob.dataframe <- ggspatial::df_spatial(dispersal.dusk.prob.raster)

Create final map figures

Create overview map

## Inset overview map: North America outlines with the study-area border
## drawn in black; coord_sf(datum = NA) suppresses the graticule.
overview.map <- ggplot() + 
  theme_bw() +
   geom_sf(data = north.america.lamb, col = "grey", fill = NA, alpha = 0.4) +
 geom_sf(data = north.america.lamb.outer, fill = NA, col = "darkgrey") +
  geom_sf(data = study.area.outer.border, col = "black", fill = NA) +
    coord_sf(datum = NA) +
  theme(plot.margin =margin(1, 1, 1, 1, "pt"))
overview.map

Create dispersal probability map, showing probability of successful dispersal from source N. pusilla lakes.

NOTE The color scales are situated outside the map area, and don’t show up. This is on purpose. When we assemble the three maps into a single figure (at bottom of script), we’ll reveal the colour scale!

First for dawn:

## Dawn dispersal-probability panel: probability surface plus study-area
## outline, lakes, genus-level Nitzschia points (open diamonds) and
## N. pusilla points (red filled diamonds), with a ggsn scale bar.
## The legend sits above the panel and is revealed when the three maps
## are assembled at the bottom of the script.
disp.prob.map.dawn <- ggplot() +
  theme_bw() +
  geom_tile(data = dispersal.dawn.prob.dataframe, aes(x = x, y = y, fill = band1)) +
   scale_fill_gradient(low = 'yellow', high = 'darkgreen',limits = c(0, 1), breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1.0), 
                      na.value = NA, guide = guide_colorbar(
                        title = "Probability of potential dispersal", 
                        direction = "horizontal", title.position = "top", 
                        ticks.colour = "black", ticks.linewidth = 1.5, 
                        title.hjust = .5, frame.colour = "black")) +
   labs(title = "Dawn (04:00 - 07:00)") +
  theme(plot.title = element_text(size = 7.5, face = "bold", hjust = 0.5, vjust = -8)) +
#    theme(plot.tag.position = c(0.17, 0.95), plot.title.position = "panel") +
  theme(legend.position = c(0.5, 1.1), 
        legend.title = element_text(size = 7.5, face = "bold"), 
        legend.text = element_text(size = 7),
        legend.key.height = unit(0.4, "cm"), legend.key.width = unit(0.95, "cm")) +
   geom_sf(data = study.area.us.projected, fill = NA) +
  geom_sf(data = study.area.lakes.projected, col = "blue") +
  ## "lwd" standardises to "size", which duplicated the explicit size
  ## aesthetic and triggered the "Duplicated aesthetics" warning; the
  ## explicit size (2.2) is the value that was used, so the redundant
  ## lwd argument is dropped.
  geom_sf(data = Nitzschia.locations, shape = 5, fill = "blue", size = 2.2) + 
  geom_sf(data = pusilla.locations.2007.cropped, shape = 23, size = 2.3, fill = "red") +
  coord_sf(xlim = c(-650000, 80000), ylim = c(-10000, 1000000)) + 
  labs(x = "Longitude", y = "Latitude") +
   theme(axis.text.x = element_text(size = 7)) +
  theme(axis.text.y = element_text(size = 7)) +
  theme(axis.title.x = element_text(size = 9)) +
  theme(axis.title.y = element_text(size = 9)) +
  theme(plot.margin =margin(1, 1, 1, 1, "pt"))
disp.prob.map.dawn <- disp.prob.map.dawn +
    ggsn::scalebar(study.area.us.projected, st.size = 1.5, 
                   dist = 50, transform = F, border.size = 0.2, 
                   dist_unit = "km", model = "GRS80", location = "bottomleft", 
                   box.fill = c("darkgrey", "white"))
disp.prob.map.dawn
Predicted probability of potential diatom dispersal: near dawn

Predicted probability of potential diatom dispersal: near dawn

Now for mid-day:

# Map of predicted dispersal probability near mid-day (12:00 - 15:00).
disp.prob.map.mid <- ggplot() +
  theme_bw() +
  # Probability raster (data-frame form) as the base layer.
  geom_tile(data = dispersal.mid.prob.dataframe, aes(x = x, y = y, fill = band1)) +
  scale_fill_gradient(low = 'yellow', high = 'darkgreen', limits = c(0, 1),
                      breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1.0), 
                      na.value = NA, guide = guide_colorbar(
                        title = "Probability of potential dispersal", 
                        direction = "horizontal", title.position = "top", 
                        ticks.colour = "black", ticks.linewidth = 1.5, 
                        title.hjust = .5, frame.colour = "black")) +
  labs(title = "Mid-day (12:00 - 15:00)") +
  theme(plot.title = element_text(size = 7.5, face = "bold", hjust = 0.5, vjust = -8)) +
#    theme(plot.tag.position = c(0.17, 0.95), plot.title.position = "panel") +
  theme(legend.position = c(0.5, 1.1), 
        legend.title = element_text(size = 7.5, face = "bold"), 
        legend.text = element_text(size = 7),
        legend.key.height = unit(0.4, "cm"), legend.key.width = unit(0.95, "cm")) +
  # Study-area outline and lake polygons.
  geom_sf(data = study.area.us.projected, fill = NA) +
  geom_sf(data = study.area.lakes.projected, col = "blue") +
  # Nitzschia occurrence lakes. The redundant "lwd = 1.5" was removed: it
  # standardises to the "size" aesthetic already set to 2.2, which produced the
  # "Duplicated aesthetics after name standardisation: size" warning.
  geom_sf(data = Nitzschia.locations, shape = 5, fill = "blue", size = 2.2) + 
  # N. pusilla source lakes (shape 23 is a fillable diamond).
  geom_sf(data = pusilla.locations.2007.cropped, shape = 23, size = 2.3, fill = "red") +
  coord_sf(xlim = c(-650000, 80000), ylim = c(-10000, 1000000)) + 
  labs(x = "Longitude", y = "Latitude") +
  # y-axis text/title suppressed: this panel sits to the right of the dawn
  # panel in the composite figure and shares its axis.
  theme(axis.text.x = element_text(size = 7)) +
  theme(axis.text.y = element_blank()) +
  theme(axis.title.x = element_text(size = 9)) +
  theme(axis.title.y = element_blank()) +
  theme(plot.margin = margin(1, 1, 1, 1, "pt"))
## Warning: Duplicated aesthetics after name standardisation: size
# Add a scale bar to the mid-day map. transform = FALSE indicates projected
# coordinates; FALSE spelled out rather than the reassignable shorthand "F".
disp.prob.map.mid <- disp.prob.map.mid +
    ggsn::scalebar(study.area.us.projected, st.size = 1.5, 
                   dist = 50, transform = FALSE, border.size = 0.2, 
                   dist_unit = "km", model = "GRS80", location = "bottomleft", 
                   box.fill = c("darkgrey", "white"))
disp.prob.map.mid
Predicted probability of potential diatom dispersal: mid-day

Predicted probability of potential diatom dispersal: mid-day

And dusk:

# Map of predicted dispersal probability near dusk (20:00 - 23:00).
disp.prob.map.dusk <- ggplot() +
  theme_bw() +
  # Probability raster (data-frame form) as the base layer.
  geom_tile(data = dispersal.dusk.prob.dataframe, aes(x = x, y = y, fill = band1)) +
  scale_fill_gradient(low = 'yellow', high = 'darkgreen', limits = c(0, 1),
                      breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1.0), 
                      na.value = NA, guide = guide_colorbar(
                        title = "Probability of potential dispersal", 
                        direction = "horizontal", title.position = "top", 
                        ticks.colour = "black", ticks.linewidth = 1.5, 
                        title.hjust = .5, frame.colour = "black")) +
  labs(title = "Dusk (20:00 - 23:00)") +
  theme(plot.title = element_text(size = 7.5, face = "bold", hjust = 0.5, vjust = -8)) +
#    theme(plot.tag.position = c(0.17, 0.95), plot.title.position = "panel") +
  theme(legend.position = c(0.5, 1.1), 
        legend.title = element_text(size = 7.5, face = "bold"), 
        legend.text = element_text(size = 7),
        legend.key.height = unit(0.4, "cm"), legend.key.width = unit(0.95, "cm")) +
  # Study-area outline and lake polygons.
  geom_sf(data = study.area.us.projected, fill = NA) +
  geom_sf(data = study.area.lakes.projected, col = "blue") +
  # Nitzschia occurrence lakes. The redundant "lwd = 1.5" was removed: it
  # standardises to the "size" aesthetic already set to 2.2, which produced the
  # "Duplicated aesthetics after name standardisation: size" warning.
  geom_sf(data = Nitzschia.locations, shape = 5, fill = "blue", size = 2.2) + 
  # N. pusilla source lakes (shape 23 is a fillable diamond).
  geom_sf(data = pusilla.locations.2007.cropped, shape = 23, size = 2.3, fill = "red") +
  coord_sf(xlim = c(-650000, 80000), ylim = c(-10000, 1000000)) + 
  labs(x = "Longitude", y = "Latitude") +
  # y-axis text/title suppressed: panel shares its y axis with the dawn panel
  # in the composite figure.
  theme(axis.text.x = element_text(size = 7)) +
  theme(axis.text.y = element_blank()) +
  theme(axis.title.x = element_text(size = 9)) +
  theme(axis.title.y = element_blank()) +
  theme(plot.margin = margin(1, 1, 1, 1, "pt"))
## Warning: Duplicated aesthetics after name standardisation: size
# Add a scale bar to the dusk map. transform = FALSE indicates projected
# coordinates; FALSE spelled out rather than the reassignable shorthand "F".
disp.prob.map.dusk <- disp.prob.map.dusk +
    ggsn::scalebar(study.area.us.projected, st.size = 1.5, 
                   dist = 50, transform = FALSE, border.size = 0.2, 
                   dist_unit = "km", model = "GRS80", location = "bottomleft", 
                   box.fill = c("darkgrey", "white"))
disp.prob.map.dusk
Predicted probability of potential diatom dispersal: dusk

Predicted probability of potential diatom dispersal: dusk

Now extract the dispersal probability values for each Nitzschia occurrence lake, and visualize their distribution.

For this we’ll exclude the N. pusilla lakes, as these are considered the sources.

all.Nitzschia.lakes.minus.pusilla <- Nitzschia.locations[match(Nitzschia.locations$SITE_ID, setdiff(Nitzschia.locations$SITE_ID, pusilla.locations.2007.cropped$SITE_ID))[!is.na(match(Nitzschia.locations$SITE_ID, setdiff(Nitzschia.locations$SITE_ID, pusilla.locations.2007.cropped$SITE_ID)))],]

Get probabilities:

# Build a long-format data frame of dispersal probabilities at each candidate
# sink lake for the three times of day.
sink.points.sp <- as_Spatial(all.Nitzschia.lakes.minus.pusilla)
sink.coords <- st_coordinates(all.Nitzschia.lakes.minus.pusilla)
# Extract each raster once; the original re-ran the dawn extraction a second
# time just to obtain its length for rep().
dawn.probs <- raster::extract(dispersal.dawn.prob.raster, sink.points.sp)
mid.probs <- raster::extract(dispersal.mid.prob.raster, sink.points.sp)
dusk.probs <- raster::extract(dispersal.dusk.prob.raster, sink.points.sp)
Nitzschia.lake.disp.probs <- data.frame(
  x = sink.coords[, 1],
  y = sink.coords[, 2],
  # Explicit factor levels fix the plotting order to Dawn -> Mid-day -> Dusk.
  time = factor(rep(c("Dawn", "Mid-day", "Dusk"), each = length(dawn.probs)),
                levels = c("Dawn", "Mid-day", "Dusk")),
  dp = c(dawn.probs, mid.probs, dusk.probs))
## Warning in data.frame(x = st_coordinates(all.Nitzschia.lakes.minus.pusilla)
## [, : row names were found from a short variable and have been discarded

Now a violin plot (with jittered points) comparing dispersal probabilities across times of day:

# Fig. 5: distribution of dispersal probabilities by time of day, shown as
# violins with individual lakes overlaid as jittered grey points.
Fig5.dp.boxplot <- ggplot(Nitzschia.lake.disp.probs, aes(y = dp, x = time)) +
  theme_bw() +
  geom_violin(col = "black") +
  geom_jitter(col = "grey", alpha = 0.5, width = 0.2) +
  ylim(0, 1) +
  labs(y = "Probability of potential dispersal", x = "Time of day") +
  # Single consolidated theme(): keep only horizontal major gridlines and
  # shrink axis text/titles for the manuscript panel size.
  theme(panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.grid.major.y = element_line(colour = "lightgrey", size = (0.25)),
        panel.grid.major.x = element_blank(),
        plot.tag.position = c(0.1, 0.95),
        axis.text.x = element_text(size = 7),
        axis.text.y = element_text(size = 7),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9))
Fig5.dp.boxplot
Fig. 5 in manuscript: Probability of potential dispersal into lakes that host *Nitzschia* diatoms (n = 80) in relation to the time of day

Fig. 5 in manuscript: Probability of potential dispersal into lakes that host Nitzschia diatoms (n = 80) in relation to the time of day

View and write figure files

Main figures

Fig_01

Figure 1 is a photograph, so no plotting code is needed here.

Fig_02

model.plot

Export Fig_02 for use in pub in PDF and tiff format:

# Export Fig_02 (model.plot) as PDF and then as 300-dpi TIFF, 6 x 4 inches.
pdf("../output/figures/Fig_02.pdf", width = 6, height = 4)
model.plot
dev.off()
## quartz_off_screen 
##                 2
tiff("../output/figures/Fig_02.tiff", width = 6, height = 4, res = 300, units = "in")
model.plot
dev.off()
## quartz_off_screen 
##                 2

Fig_03

Combine figures 3A, 3B, and 3C for manuscript using the patchwork package

# Preview the composite figure; patchwork's "/" stacks panels vertically and
# "+" places them side by side.
Fig3A.dist.prob.vpd / (flight.dist.hist + neighbour.dist.hist)

# Export Fig_03 as PDF and then as 300-dpi TIFF, 7 x 8 inches.
#tiff(file = "../output/figures/Fig_03.tiff", height = 8, width = 7, units = "in", res = 300)
pdf(file = "../output/figures/Fig_03.pdf", height = 8, width = 7)
Fig3A.dist.prob.vpd / (flight.dist.hist + neighbour.dist.hist)
dev.off()
## quartz_off_screen 
##                 2
tiff(file = "../output/figures/Fig_03.tiff", height = 8, width = 7, units = "in", res = 300)
Fig3A.dist.prob.vpd / (flight.dist.hist + neighbour.dist.hist)
dev.off()
## quartz_off_screen 
##                 2

Fig_04

See this webpage for manipulating positioning of scales and graphs using patchwork:

https://patchwork.data-imaginist.com/articles/guides/layout.html

# patchwork provides area()/guide_area()/plot_layout(); library() errors
# immediately if the package is missing, whereas require() would only return
# FALSE and let the script fail later with a confusing message.
library(patchwork)
# Design grid for the composite figure; plots are matched to areas in the
# order they are added to the patchwork below.
layout <- c(
  area(t = 1, l = 3, b = 2.75, r = 4),
  area(t = 1.75, l = 4, b = 2.75, r = 9),
  area(t = 3, l = 1, b = 7, r = 3),
  area(t = 3, l = 4, b = 7, r = 6),
  area(t = 3, l = 7, b = 7, r = 9)
)

# guides = "collect" pools the panels' (identical) colourbars into guide_area().
Fig_04 <- overview.map + guide_area() + disp.prob.map.dawn + disp.prob.map.mid + disp.prob.map.dusk + plot_layout(guides = "collect", design = layout)
Fig_04

# Export Fig_04 as PDF and then as 300-dpi TIFF, 8 x 6 inches.
pdf(file = "../output/figures/Fig_04.pdf", height = 6, width = 8)
Fig_04
dev.off()
## quartz_off_screen 
##                 2
tiff(file = "../output/figures/Fig_04.tiff", height = 6, width = 8, units = "in", res = 300)
#pdf(file = "../output/figures/Fig_04.pdf", height = 6, width = 8)
Fig_04
dev.off()
## quartz_off_screen 
##                 2

Fig_05

Violin plot of dispersal probabilities

# Display Fig. 5. The recorded warnings indicate a few dp values were NA (or
# fell outside ylim(0, 1)) and were dropped from the violin/jitter layers.
Fig5.dp.boxplot
## Warning: Removed 3 rows containing non-finite values (stat_ydensity).
## Warning: Removed 4 rows containing missing values (geom_point).

# Export Fig_05 as PDF, 3.5 x 5 inches.
pdf(file = "../output/figures/Fig_05.pdf", height = 5, width = 3.5)
Fig5.dp.boxplot
## Warning: Removed 3 rows containing non-finite values (stat_ydensity).
## Warning: Removed 4 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen 
##                 2

TIFF

# Export Fig_05 as 300-dpi TIFF, 3.5 x 5 inches.
tiff(file = "../output/figures/Fig_05.tiff", height = 5, width = 3.5, units = "in", res = 300)
Fig5.dp.boxplot
## Warning: Removed 3 rows containing non-finite values (stat_ydensity).
## Warning: Removed 4 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen 
##                 2

Supplemental figures

Note that Figures S1 and S2 are photographs, associated with these files stored in the “./images” directory:

  • “Fig_S1.tiff”
  • “Fig_S2.tiff”

Fig_S3

Now combine figures S3a, S3b, and S3c for manuscript using the patchwork package

# Preview the three stacked Fig. S3 panels (patchwork "/" stacks vertically).
Fig_S3a / Fig_S3b / Fig_S3c

# Export Fig_S3 as PDF and then as 300-dpi TIFF, 7 x 10 inches.
pdf(file = "../output/figures/Fig_S3.pdf", height = 10, width = 7)
Fig_S3a / Fig_S3b / Fig_S3c
dev.off()
## quartz_off_screen 
##                 2
tiff(file = "../output/figures/Fig_S3.tiff", height = 10, width = 7, units = "in", res = 300)
Fig_S3a / Fig_S3b / Fig_S3c
# NOTE(review): the recorded warnings below indicate the Hmisc package was not
# installed when the TIFF was rendered, so the stat_summary() layers may be
# missing from the exported TIFF — install Hmisc and re-render to confirm.
## Warning: Computation failed in `stat_summary()`:
## Hmisc package required for this function

## Warning: Computation failed in `stat_summary()`:
## Hmisc package required for this function

## Warning: Computation failed in `stat_summary()`:
## Hmisc package required for this function

## Warning: Computation failed in `stat_summary()`:
## Hmisc package required for this function

## Warning: Computation failed in `stat_summary()`:
## Hmisc package required for this function

## Warning: Computation failed in `stat_summary()`:
## Hmisc package required for this function
dev.off()
## quartz_off_screen 
##                 2

Fig_S4

# Fig. S4: calibration curve (cal.curve.4pl; "4pl" suggests a 4-parameter
# logistic fit — confirm against the model-fitting chunk) of fluorometer
# intensity vs. diatom cell count, annotated with the measured day-1 and
# day-8 intensities and the predicted day-8 cell count.
investr::plotFit(cal.curve.4pl,
     xlab = "Diatom cell count (cells/mL)",
     ylab = "Relative Spectrofluorometer Intensity (arbitrary units)",
     pch = 1, ylim = c(-10, 160), las = 1)
# Horizontal reference lines at the day-1 (max.int) and day-8 (min.int)
# measured intensities; the huge x extents just span the visible plot region.
segments(-50000000, max.int, 1000000000, max.int, lty = 1, col = "red")
segments(-50000000, min.int,min.dens, min.int, lty = 1, col = "red")
# Arrow dropping from the day-8 intensity to the predicted count (min.dens).
arrows(min.dens, min.int, min.dens, -15, lty = 1, length = 0.15, angle = 20, col = "red")
text(min.dens + 10000, -10, labels = "Predicted count day 8", col = "red", pos = 4, cex = 0.8)
text(-30000, max.int - 8, labels = "Measured intensity day 1", col = "red", pos = 4, cex = 0.8)
text(-30000, min.int + 8, labels = "Measured intensity day 8", col = "red", pos = 4, cex = 0.8)

Print to PDF

# Export Fig_S4 as PDF (same plot and annotations as displayed above).
pdf("../output/figures/Fig_S4.pdf", width = 8, height = 6.5)
investr::plotFit(cal.curve.4pl,
     xlab = "Diatom cell count (cells/mL)",
     ylab = "Relative Spectrofluorometer Intensity (arbitrary units)",
     pch = 1, ylim = c(-10, 160), las = 1)
segments(-50000000, max.int, 1000000000, max.int, lty = 1, col = "red")
segments(-50000000, min.int,min.dens, min.int, lty = 1, col = "red")
arrows(min.dens, min.int, min.dens, -15, lty = 1, length = 0.15, angle = 20, col = "red")
text(min.dens + 10000, -10, labels = "Predicted count day 8", col = "red", pos = 4, cex = 0.8)
text(-30000, max.int - 8, labels = "Measured intensity day 1", col = "red", pos = 4, cex = 0.8)
text(-30000, min.int + 8, labels = "Measured intensity day 8", col = "red", pos = 4, cex = 0.8)
dev.off()
## quartz_off_screen 
##                 2

TIFF

# Export Fig_S4 as 300-dpi TIFF (same plot and annotations as above).
tiff("../output/figures/Fig_S4.tiff", width = 8, height = 6.5, units = "in", res = 300)
investr::plotFit(cal.curve.4pl,
     xlab = "Diatom cell count (cells/mL)",
     ylab = "Relative Spectrofluorometer Intensity (arbitrary units)",
     pch = 1, ylim = c(-10, 160), las = 1)
segments(-50000000, max.int, 1000000000, max.int, lty = 1, col = "red")
segments(-50000000, min.int,min.dens, min.int, lty = 1, col = "red")
arrows(min.dens, min.int, min.dens, -15, lty = 1, length = 0.15, angle = 20, col = "red")
text(min.dens + 10000, -10, labels = "Predicted count day 8", col = "red", pos = 4, cex = 0.8)
text(-30000, max.int - 8, labels = "Measured intensity day 1", col = "red", pos = 4, cex = 0.8)
text(-30000, min.int + 8, labels = "Measured intensity day 8", col = "red", pos = 4, cex = 0.8)
dev.off()
## quartz_off_screen 
##                 2

Fig_S5

Fig_S5

Print to PDF

# Export Fig_S5 as PDF, 4 x 6 inches.
pdf(file = "../output/figures/Fig_S5.pdf", height = 6, width = 4)
Fig_S5
dev.off()
## quartz_off_screen 
##                 2

Print to TIFF

# Export Fig_S5 as 300-dpi TIFF, 4 x 6 inches.
tiff(file = "../output/figures/Fig_S5.tiff", height = 6, width = 4, units = "in", res = 300)
Fig_S5
dev.off()
## quartz_off_screen 
##                 2

Fig_S6

Display the figure:

Fig_S6_vpd.glm.plot

Print to PDF

# Export Fig_S6 as PDF, 6 x 4 inches.
pdf(file = "../output/figures/Fig_S6.pdf", height = 4, width = 6)
Fig_S6_vpd.glm.plot
dev.off()
## quartz_off_screen 
##                 2

Print to TIFF

# Export Fig_S6 as 300-dpi TIFF, 6 x 4 inches.
tiff(file = "../output/figures/Fig_S6.tiff", height = 4, width = 6, units = "in", res = 300)
Fig_S6_vpd.glm.plot
dev.off()
## quartz_off_screen 
##                 2

Fig_S7

Combine three plots using patchwork

# patchwork supplies the "/" operator used to stack the panels; library()
# errors immediately if the package is missing, whereas require() would only
# return FALSE and let the script fail later.
library(patchwork)
# Stack the temperature, relative-humidity, and vapour-pressure time plots.
Fig_S7 <- Fig_S7.temp.timeplot / Fig_S7.rh.timeplot / Fig_S7.vpa.timeplot
Fig_S7

Print to PDF

# Export Fig_S7 as PDF, 6 x 10 inches.
pdf(file = "../output/figures/Fig_S7.pdf", height = 10, width = 6)
Fig_S7
dev.off()
## quartz_off_screen 
##                 2

Print to TIFF

# Export Fig_S7 as 300-dpi TIFF, 6 x 10 inches.
tiff(file = "../output/figures/Fig_S7.tiff", height = 10, width = 6, units = "in", res = 300)
Fig_S7
dev.off()
## quartz_off_screen 
##                 2

Fig_S8

Maps of VPD

# patchwork provides area()/guide_area()/plot_layout(); library() errors
# immediately if the package is missing (require() would only warn).
library(patchwork)
# Layout design identical to the one used for Fig_04: plots are matched to
# areas in the order they are added below.
layout <- c(
  area(t = 1, l = 3, b = 2.75, r = 4),
  area(t = 1.75, l = 4, b = 2.75, r = 9),
  area(t = 3, l = 1, b = 7, r = 3),
  area(t = 3, l = 4, b = 7, r = 6),
  area(t = 3, l = 7, b = 7, r = 9)
)

Fig_S8 <- overview.map + guide_area() + VPD.dawn.map + VPD.mid.map + VPD.dusk.map + plot_layout(guides = "collect", design = layout)
Fig_S8
## Warning: Removed 7951 rows containing missing values (geom_raster).

## Warning: Removed 7951 rows containing missing values (geom_raster).

## Warning: Removed 7951 rows containing missing values (geom_raster).

PDF

# Export Fig_S8 as PDF, 8 x 6 inches. The recorded warnings reflect NA raster
# cells dropped by geom_raster, one warning per map panel.
pdf(file = "../output/figures/Fig_S8.pdf", height = 6, width = 8)
Fig_S8
## Warning: Removed 7951 rows containing missing values (geom_raster).

## Warning: Removed 7951 rows containing missing values (geom_raster).

## Warning: Removed 7951 rows containing missing values (geom_raster).
dev.off()
## quartz_off_screen 
##                 2

TIFF

# Export Fig_S8 as 300-dpi TIFF, 8 x 6 inches (same NA-cell warnings as above).
tiff(file = "../output/figures/Fig_S8.tiff", height = 6, width = 8, units = "in", res = 300)
Fig_S8
## Warning: Removed 7951 rows containing missing values (geom_raster).

## Warning: Removed 7951 rows containing missing values (geom_raster).

## Warning: Removed 7951 rows containing missing values (geom_raster).
dev.off()
## quartz_off_screen 
##                 2